Text Generation
Transformers
Safetensors
qwen2
Generated from Trainer
open-r1
trl
grpo
conversational
text-generation-inference
Instructions to use kangdawei/DRA-GRPO with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use kangdawei/DRA-GRPO with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="kangdawei/DRA-GRPO")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("kangdawei/DRA-GRPO")
model = AutoModelForCausalLM.from_pretrained("kangdawei/DRA-GRPO")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use kangdawei/DRA-GRPO with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "kangdawei/DRA-GRPO"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "kangdawei/DRA-GRPO",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/kangdawei/DRA-GRPO
- SGLang
How to use kangdawei/DRA-GRPO with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "kangdawei/DRA-GRPO" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "kangdawei/DRA-GRPO",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "kangdawei/DRA-GRPO" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "kangdawei/DRA-GRPO",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use kangdawei/DRA-GRPO with Docker Model Runner:
docker model run hf.co/kangdawei/DRA-GRPO
| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5714285714285714, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 2571.2083587646484, | |
| "epoch": 0.001142857142857143, | |
| "grad_norm": 0.19501760601997375, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": -0.0, | |
| "reward": 0.08349451050162315, | |
| "reward_std": 0.14101681299507618, | |
| "rewards/cosine_scaled_reward": -0.015534311532974243, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 2804.395881652832, | |
| "epoch": 0.002285714285714286, | |
| "grad_norm": 0.18392972648143768, | |
| "kl": 0.0, | |
| "learning_rate": 4e-08, | |
| "loss": -0.0, | |
| "reward": 0.04647743375971913, | |
| "reward_std": 0.071280462667346, | |
| "rewards/cosine_scaled_reward": -0.04980122856795788, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 3326.5208435058594, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.16814576089382172, | |
| "kl": 4.73707914352417e-05, | |
| "learning_rate": 6e-08, | |
| "loss": 0.0, | |
| "reward": -0.05517918919213116, | |
| "reward_std": 0.06846281955949962, | |
| "rewards/cosine_scaled_reward": -0.23461355827748775, | |
| "rewards/format_reward": 0.1458333395421505, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 2271.854202270508, | |
| "epoch": 0.004571428571428572, | |
| "grad_norm": 0.20840850472450256, | |
| "kl": 3.003329038619995e-05, | |
| "learning_rate": 8e-08, | |
| "loss": 0.0, | |
| "reward": 0.104565495159477, | |
| "reward_std": 0.1621289630420506, | |
| "rewards/cosine_scaled_reward": -0.016989090479910374, | |
| "rewards/format_reward": 0.6458333358168602, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 3269.250030517578, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.20661523938179016, | |
| "kl": 3.783777356147766e-05, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0, | |
| "reward": -0.027549213060410693, | |
| "reward_std": 0.11831692652776837, | |
| "rewards/cosine_scaled_reward": -0.22735669882968068, | |
| "rewards/format_reward": 0.29166667349636555, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 3207.125, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.25092315673828125, | |
| "kl": 4.430115222930908e-05, | |
| "learning_rate": 1.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.0003576022572815418, | |
| "reward_std": 0.13864743057638407, | |
| "rewards/cosine_scaled_reward": -0.12470872118137777, | |
| "rewards/format_reward": 0.2500000074505806, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 3157.375045776367, | |
| "epoch": 0.008, | |
| "grad_norm": 0.17134779691696167, | |
| "kl": 2.024322748184204e-05, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.024647740297950804, | |
| "reward_std": 0.12343645561486483, | |
| "rewards/cosine_scaled_reward": -0.10497256461530924, | |
| "rewards/format_reward": 0.354166679084301, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 2640.6875610351562, | |
| "epoch": 0.009142857142857144, | |
| "grad_norm": 0.1863798052072525, | |
| "kl": 1.722201704978943e-05, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.11913361493498087, | |
| "reward_std": 0.13311758637428284, | |
| "rewards/cosine_scaled_reward": 0.08812191832112148, | |
| "rewards/format_reward": 0.520833345130086, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 3171.4166870117188, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.20052321255207062, | |
| "kl": 4.750490188598633e-05, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.04684965265914798, | |
| "reward_std": 0.10927984630689025, | |
| "rewards/cosine_scaled_reward": -0.030095582733338233, | |
| "rewards/format_reward": 0.3333333469927311, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 2619.479217529297, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.1900119185447693, | |
| "kl": 2.4830922484397888e-05, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0, | |
| "reward": 0.0658606368524488, | |
| "reward_std": 0.13385961623862386, | |
| "rewards/cosine_scaled_reward": -0.01448088325560093, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 3247.4583740234375, | |
| "epoch": 0.012571428571428572, | |
| "grad_norm": 0.15901124477386475, | |
| "kl": 2.709031105041504e-05, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": 0.0, | |
| "reward": -0.04511611349880695, | |
| "reward_std": 0.10903473664075136, | |
| "rewards/cosine_scaled_reward": -0.21695117373019457, | |
| "rewards/format_reward": 0.16666667349636555, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 2527.062515258789, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.21673277020454407, | |
| "kl": 4.7400593757629395e-05, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.07390611409209669, | |
| "reward_std": 0.13868055026978254, | |
| "rewards/cosine_scaled_reward": -0.0959229115396738, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 2923.7083740234375, | |
| "epoch": 0.014857142857142857, | |
| "grad_norm": 0.18947269022464752, | |
| "kl": 3.428012132644653e-05, | |
| "learning_rate": 2.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.035209502559155226, | |
| "reward_std": 0.11472400068305433, | |
| "rewards/cosine_scaled_reward": -0.0937328040599823, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 2934.9166870117188, | |
| "epoch": 0.016, | |
| "grad_norm": 0.24321378767490387, | |
| "kl": 3.0156224966049194e-05, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.022672508843243122, | |
| "reward_std": 0.12115806993097067, | |
| "rewards/cosine_scaled_reward": -0.1290947226807475, | |
| "rewards/format_reward": 0.39583333767950535, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 2780.0833587646484, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.2025020718574524, | |
| "kl": 2.3249536752700806e-05, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0, | |
| "reward": 0.07990049698855728, | |
| "reward_std": 0.07795312092639506, | |
| "rewards/cosine_scaled_reward": 0.005295965820550919, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 3492.312530517578, | |
| "epoch": 0.018285714285714287, | |
| "grad_norm": 0.18354374170303345, | |
| "kl": 4.486739635467529e-05, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.06231466308236122, | |
| "reward_std": 0.08755358215421438, | |
| "rewards/cosine_scaled_reward": -0.22571247071027756, | |
| "rewards/format_reward": 0.0833333358168602, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 2405.56254196167, | |
| "epoch": 0.019428571428571427, | |
| "grad_norm": 0.2691311836242676, | |
| "kl": 3.308616578578949e-05, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": 0.0, | |
| "reward": 0.07129185972735286, | |
| "reward_std": 0.13615732779726386, | |
| "rewards/cosine_scaled_reward": -0.07220900629181415, | |
| "rewards/format_reward": 0.562500013038516, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 2926.854232788086, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.16645942628383636, | |
| "kl": 2.5579705834388733e-05, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.018077057087793946, | |
| "reward_std": 0.09778019832447171, | |
| "rewards/cosine_scaled_reward": -0.12368878477718681, | |
| "rewards/format_reward": 0.35416667349636555, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 2740.2083740234375, | |
| "epoch": 0.021714285714285714, | |
| "grad_norm": 0.36254823207855225, | |
| "kl": 2.2433698177337646e-05, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.0, | |
| "reward": 0.11815963860135525, | |
| "reward_std": 0.17526093125343323, | |
| "rewards/cosine_scaled_reward": 0.10508107638452202, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 2277.77091217041, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.19969557225704193, | |
| "kl": 1.2811273336410522e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0, | |
| "reward": 0.12608362920582294, | |
| "reward_std": 0.16070688236504793, | |
| "rewards/cosine_scaled_reward": 0.003691190853714943, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 2652.770896911621, | |
| "epoch": 0.024, | |
| "grad_norm": 0.28585541248321533, | |
| "kl": 4.544854164123535e-05, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.0, | |
| "reward": 0.02955322596244514, | |
| "reward_std": 0.12703021708875895, | |
| "rewards/cosine_scaled_reward": -0.10260925628244877, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 1915.7292022705078, | |
| "epoch": 0.025142857142857144, | |
| "grad_norm": 0.2661941349506378, | |
| "kl": 2.355128526687622e-05, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0, | |
| "reward": 0.11611313896719366, | |
| "reward_std": 0.1277361772954464, | |
| "rewards/cosine_scaled_reward": -0.013740219175815582, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 2549.1458892822266, | |
| "epoch": 0.026285714285714287, | |
| "grad_norm": 0.2124091237783432, | |
| "kl": 3.099720925092697e-05, | |
| "learning_rate": 4.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.05105366797943134, | |
| "reward_std": 0.1019338914193213, | |
| "rewards/cosine_scaled_reward": -0.10208869446069002, | |
| "rewards/format_reward": 0.500000013038516, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 2841.8333740234375, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.23832456767559052, | |
| "kl": 2.9023736715316772e-05, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.055271712597459555, | |
| "reward_std": 0.15443201549351215, | |
| "rewards/cosine_scaled_reward": -0.05593502405099571, | |
| "rewards/format_reward": 0.43750000931322575, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 2734.6250228881836, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.20576398074626923, | |
| "kl": 4.264712333679199e-05, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0, | |
| "reward": 0.03582445718348026, | |
| "reward_std": 0.14606032520532608, | |
| "rewards/cosine_scaled_reward": -0.0837564684334211, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 2954.3958740234375, | |
| "epoch": 0.029714285714285714, | |
| "grad_norm": 0.16195496916770935, | |
| "kl": 3.225356340408325e-05, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.06916803470812738, | |
| "reward_std": 0.08843644242733717, | |
| "rewards/cosine_scaled_reward": -0.023365769535303116, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 2985.125045776367, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.18544963002204895, | |
| "kl": 2.6202760636806488e-05, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.0820373228052631, | |
| "reward_std": 0.15796185052022338, | |
| "rewards/cosine_scaled_reward": -0.017732856795191765, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 2774.625015258789, | |
| "epoch": 0.032, | |
| "grad_norm": 0.1868346929550171, | |
| "kl": 3.4227967262268066e-05, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.09084612363949418, | |
| "reward_std": 0.08862769743427634, | |
| "rewards/cosine_scaled_reward": 0.03848753869533539, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 3335.4791870117188, | |
| "epoch": 0.03314285714285714, | |
| "grad_norm": 0.1792682260274887, | |
| "kl": 2.7485191822052002e-05, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.0, | |
| "reward": -0.04747861542273313, | |
| "reward_std": 0.10171156749129295, | |
| "rewards/cosine_scaled_reward": -0.22336439974606037, | |
| "rewards/format_reward": 0.16666666977107525, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 3182.291717529297, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.20437775552272797, | |
| "kl": 2.625095658004284e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0, | |
| "reward": 0.04219546925742179, | |
| "reward_std": 0.16496449895203114, | |
| "rewards/cosine_scaled_reward": -0.04208953632041812, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 3098.729202270508, | |
| "epoch": 0.03542857142857143, | |
| "grad_norm": 0.22590124607086182, | |
| "kl": 3.269314765930176e-05, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.005523839652596507, | |
| "reward_std": 0.14406571350991726, | |
| "rewards/cosine_scaled_reward": -0.10929703339934349, | |
| "rewards/format_reward": 0.2500000111758709, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 3180.729248046875, | |
| "epoch": 0.036571428571428574, | |
| "grad_norm": 0.1668621450662613, | |
| "kl": 2.413243055343628e-05, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.02685558469966054, | |
| "reward_std": 0.11427591601386666, | |
| "rewards/cosine_scaled_reward": -0.07639202522113919, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 3349.291748046875, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.14419840276241302, | |
| "kl": 3.168731927871704e-05, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.039358544163405895, | |
| "reward_std": 0.16947886534035206, | |
| "rewards/cosine_scaled_reward": -0.06123522081179544, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 2380.9167098999023, | |
| "epoch": 0.038857142857142854, | |
| "grad_norm": 0.22925761342048645, | |
| "kl": 2.7898699045181274e-05, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0, | |
| "reward": 0.10769698303192854, | |
| "reward_std": 0.1343193519860506, | |
| "rewards/cosine_scaled_reward": 0.024825414642691612, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 2984.8959197998047, | |
| "epoch": 0.04, | |
| "grad_norm": 0.22632969915866852, | |
| "kl": 3.349222242832184e-05, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0, | |
| "reward": 0.008388399612158537, | |
| "reward_std": 0.10677651688456535, | |
| "rewards/cosine_scaled_reward": -0.172616648953408, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 3379.3333740234375, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.1803259551525116, | |
| "kl": 3.5781413316726685e-05, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.04249414807418361, | |
| "reward_std": 0.11586784245446324, | |
| "rewards/cosine_scaled_reward": -0.21885607368312776, | |
| "rewards/format_reward": 0.18750000186264515, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 3416.3541870117188, | |
| "epoch": 0.04228571428571429, | |
| "grad_norm": 0.14628717303276062, | |
| "kl": 1.6780570149421692e-05, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0, | |
| "reward": -0.050946102710440755, | |
| "reward_std": 0.07268051384016871, | |
| "rewards/cosine_scaled_reward": -0.22305289562791586, | |
| "rewards/format_reward": 0.1458333395421505, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 3168.875015258789, | |
| "epoch": 0.04342857142857143, | |
| "grad_norm": 0.15368561446666718, | |
| "kl": 2.38809734582901e-05, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0, | |
| "reward": -0.01665067719295621, | |
| "reward_std": 0.06848278688266873, | |
| "rewards/cosine_scaled_reward": -0.14153539016842842, | |
| "rewards/format_reward": 0.1875, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 2897.9583740234375, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.17705340683460236, | |
| "kl": 1.7940998077392578e-05, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.05753035913221538, | |
| "reward_std": 0.07578674505930394, | |
| "rewards/cosine_scaled_reward": -0.04679988697171211, | |
| "rewards/format_reward": 0.4375000037252903, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 2554.7708892822266, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.22783087193965912, | |
| "kl": 2.1063722670078278e-05, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0, | |
| "reward": 0.04392236044805031, | |
| "reward_std": 0.11113758757710457, | |
| "rewards/cosine_scaled_reward": -0.11080039292573929, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 2975.8958740234375, | |
| "epoch": 0.046857142857142854, | |
| "grad_norm": 0.22396467626094818, | |
| "kl": 1.806439831852913e-05, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.0, | |
| "reward": -0.02207160711986944, | |
| "reward_std": 0.12583239562809467, | |
| "rewards/cosine_scaled_reward": -0.24189986009150743, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 2725.9166984558105, | |
| "epoch": 0.048, | |
| "grad_norm": 0.27289333939552307, | |
| "kl": 3.649294376373291e-05, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.0, | |
| "reward": -0.031094663951080292, | |
| "reward_std": 0.07978959055617452, | |
| "rewards/cosine_scaled_reward": -0.28885440342128277, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 2959.916702270508, | |
| "epoch": 0.04914285714285714, | |
| "grad_norm": 0.20727042853832245, | |
| "kl": 2.0106323063373566e-05, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.02459817798808217, | |
| "reward_std": 0.1277715265750885, | |
| "rewards/cosine_scaled_reward": -0.08350535575300455, | |
| "rewards/format_reward": 0.3125000037252903, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 2700.6458740234375, | |
| "epoch": 0.05028571428571429, | |
| "grad_norm": 0.2838585376739502, | |
| "kl": 5.9839338064193726e-05, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.07408008817583323, | |
| "reward_std": 0.1297681350260973, | |
| "rewards/cosine_scaled_reward": -0.011715584434568882, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 3354.8125610351562, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.1483648717403412, | |
| "kl": 1.9919127225875854e-05, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0, | |
| "reward": 0.042736097471788526, | |
| "reward_std": 0.13064279220998287, | |
| "rewards/cosine_scaled_reward": -0.017792840400943533, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 3156.3959045410156, | |
| "epoch": 0.052571428571428575, | |
| "grad_norm": 0.18322643637657166, | |
| "kl": 2.984795719385147e-05, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.0388933519134298, | |
| "reward_std": 0.09174182126298547, | |
| "rewards/cosine_scaled_reward": -0.21921097254380584, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 2675.1042098999023, | |
| "epoch": 0.053714285714285714, | |
| "grad_norm": 0.22902333736419678, | |
| "kl": 3.225822001695633e-05, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.11461905902251601, | |
| "reward_std": 0.14914221363142133, | |
| "rewards/cosine_scaled_reward": 0.06888513453304768, | |
| "rewards/format_reward": 0.5416666809469461, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 2940.250015258789, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.1693173050880432, | |
| "kl": 9.981123730540276e-05, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.029834913242666516, | |
| "reward_std": 0.1260078912600875, | |
| "rewards/cosine_scaled_reward": -0.07712742034345865, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 2305.0000381469727, | |
| "epoch": 0.056, | |
| "grad_norm": 0.2164977639913559, | |
| "kl": 7.113814353942871e-05, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.06258401460945606, | |
| "reward_std": 0.12052545137703419, | |
| "rewards/cosine_scaled_reward": -0.0762196818832308, | |
| "rewards/format_reward": 0.5208333376795053, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 2923.770866394043, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.165934756398201, | |
| "kl": 5.564838647842407e-05, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 0.02972496929578483, | |
| "reward_std": 0.1184019073843956, | |
| "rewards/cosine_scaled_reward": -0.07874834412359633, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 2164.1250076293945, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.28015246987342834, | |
| "kl": 0.00023446721024811268, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": 0.0, | |
| "reward": 0.06518604746088386, | |
| "reward_std": 0.08450535265728831, | |
| "rewards/cosine_scaled_reward": -0.08784076571464539, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 2815.583396911621, | |
| "epoch": 0.05942857142857143, | |
| "grad_norm": 0.22428154945373535, | |
| "kl": 0.00012213829904794693, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0, | |
| "reward": 0.08840094119659625, | |
| "reward_std": 0.17651066416874528, | |
| "rewards/cosine_scaled_reward": 0.02174525521695614, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 2765.291717529297, | |
| "epoch": 0.060571428571428575, | |
| "grad_norm": 0.20454645156860352, | |
| "kl": 0.00013817846775054932, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.0, | |
| "reward": 0.13577738002641127, | |
| "reward_std": 0.15497551951557398, | |
| "rewards/cosine_scaled_reward": 0.10726968757808208, | |
| "rewards/format_reward": 0.5833333376795053, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 2602.104217529297, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.17332378029823303, | |
| "kl": 6.633996963500977e-05, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.0, | |
| "reward": 0.17019405495375395, | |
| "reward_std": 0.14906789222732186, | |
| "rewards/cosine_scaled_reward": 0.18696625716984272, | |
| "rewards/format_reward": 0.6250000167638063, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 2931.2708587646484, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.1711331158876419, | |
| "kl": 8.093938231468201e-05, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0, | |
| "reward": 0.06622584909200668, | |
| "reward_std": 0.12079534726217389, | |
| "rewards/cosine_scaled_reward": -0.012890823185443878, | |
| "rewards/format_reward": 0.41666666977107525, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 2785.833396911621, | |
| "epoch": 0.064, | |
| "grad_norm": 0.18859043717384338, | |
| "kl": 3.4786760807037354e-05, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": 0.0, | |
| "reward": 0.058590125758200884, | |
| "reward_std": 0.0908779576420784, | |
| "rewards/cosine_scaled_reward": -0.045839957892894745, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 3269.0208740234375, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.12594451010227203, | |
| "kl": 1.267390325665474e-05, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": 0.0, | |
| "reward": 0.023180216550827026, | |
| "reward_std": 0.12568830093368888, | |
| "rewards/cosine_scaled_reward": -0.09830697299912572, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 2111.645866394043, | |
| "epoch": 0.06628571428571428, | |
| "grad_norm": 0.2272089719772339, | |
| "kl": 0.0004666820168495178, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0, | |
| "reward": 0.1490978323854506, | |
| "reward_std": 0.143410362303257, | |
| "rewards/cosine_scaled_reward": 0.06337156053632498, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 2783.1250228881836, | |
| "epoch": 0.06742857142857143, | |
| "grad_norm": 0.17396898567676544, | |
| "kl": 2.6125460863113403e-05, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": 0.0, | |
| "reward": 0.01869871746748686, | |
| "reward_std": 0.12852068059146404, | |
| "rewards/cosine_scaled_reward": -0.1118423049338162, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 2860.500045776367, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.1613863706588745, | |
| "kl": 6.277300417423248e-05, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0, | |
| "reward": 0.036287183640524745, | |
| "reward_std": 0.12081348802894354, | |
| "rewards/cosine_scaled_reward": -0.10168062068987638, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 2981.479217529297, | |
| "epoch": 0.06971428571428571, | |
| "grad_norm": 0.16875192523002625, | |
| "kl": 0.00013221928384155035, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.0, | |
| "reward": 0.050741570768877864, | |
| "reward_std": 0.14027578197419643, | |
| "rewards/cosine_scaled_reward": -0.10068160435184836, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 2615.1042251586914, | |
| "epoch": 0.07085714285714285, | |
| "grad_norm": 0.19548830389976501, | |
| "kl": 0.0005001500248908997, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.0, | |
| "reward": 0.09628129447810352, | |
| "reward_std": 0.09381117532029748, | |
| "rewards/cosine_scaled_reward": -0.006604377180337906, | |
| "rewards/format_reward": 0.583333333954215, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 2180.791763305664, | |
| "epoch": 0.072, | |
| "grad_norm": 0.2137078046798706, | |
| "kl": 0.0017590373754501343, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": 0.0001, | |
| "reward": 0.18858059402555227, | |
| "reward_std": 0.1375174829736352, | |
| "rewards/cosine_scaled_reward": 0.18239261582493782, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 2905.479217529297, | |
| "epoch": 0.07314285714285715, | |
| "grad_norm": 0.17932051420211792, | |
| "kl": 0.0002357116900384426, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": 0.0, | |
| "reward": 0.041001017816597596, | |
| "reward_std": 0.1474486207589507, | |
| "rewards/cosine_scaled_reward": -0.05582154542207718, | |
| "rewards/format_reward": 0.35416667722165585, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 2707.2917137145996, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.18318000435829163, | |
| "kl": 0.0001243998558493331, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0, | |
| "reward": 0.05281507736071944, | |
| "reward_std": 0.10035080322995782, | |
| "rewards/cosine_scaled_reward": -0.08306887093931437, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 2048.8333435058594, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.2723003029823303, | |
| "kl": 0.0007075890898704529, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.0, | |
| "reward": 0.08446738217025995, | |
| "reward_std": 0.12608648044988513, | |
| "rewards/cosine_scaled_reward": -0.031810659915208817, | |
| "rewards/format_reward": 0.5625000018626451, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 3272.250015258789, | |
| "epoch": 0.07657142857142857, | |
| "grad_norm": 0.14235161244869232, | |
| "kl": 0.0003021508455276489, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.0, | |
| "reward": -0.09321667347103357, | |
| "reward_std": 0.06564409867860377, | |
| "rewards/cosine_scaled_reward": -0.358583465218544, | |
| "rewards/format_reward": 0.16666667349636555, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 1971.250015258789, | |
| "epoch": 0.07771428571428571, | |
| "grad_norm": 0.25673460960388184, | |
| "kl": 0.001732461154460907, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": 0.0001, | |
| "reward": 0.10094248503446579, | |
| "reward_std": 0.14019617764279246, | |
| "rewards/cosine_scaled_reward": -0.018839816562831402, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 2486.1666870117188, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.24227525293827057, | |
| "kl": 0.0012736618518829346, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": 0.0001, | |
| "reward": 0.002093482413329184, | |
| "reward_std": 0.1131673906929791, | |
| "rewards/cosine_scaled_reward": -0.22560235299170017, | |
| "rewards/format_reward": 0.45833334513008595, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 3097.7708740234375, | |
| "epoch": 0.08, | |
| "grad_norm": 0.16386379301548004, | |
| "kl": 0.0010862918570637703, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0, | |
| "reward": 0.003557512885890901, | |
| "reward_std": 0.07532872771844268, | |
| "rewards/cosine_scaled_reward": -0.1663934402167797, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 2676.9166946411133, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.3169231712818146, | |
| "kl": 0.0008103922009468079, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.0, | |
| "reward": 0.057090925984084606, | |
| "reward_std": 0.12368696788325906, | |
| "rewards/cosine_scaled_reward": -0.009727515280246735, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 2502.604202270508, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.22013245522975922, | |
| "kl": 0.0007383376359939575, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.0, | |
| "reward": 0.034887210465967655, | |
| "reward_std": 0.1007420509122312, | |
| "rewards/cosine_scaled_reward": -0.14844494126737118, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 3457.125, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.14216576516628265, | |
| "kl": 0.00022399425506591797, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": 0.0, | |
| "reward": 0.0063769330736249685, | |
| "reward_std": 0.0942028573481366, | |
| "rewards/cosine_scaled_reward": -0.07475030946079642, | |
| "rewards/format_reward": 0.1875000074505806, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 3151.3125610351562, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.16055600345134735, | |
| "kl": 0.0008836947381496429, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.0, | |
| "reward": 0.08290100377053022, | |
| "reward_std": 0.16621626261621714, | |
| "rewards/cosine_scaled_reward": 0.06829957757145166, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 2834.812530517578, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.17003442347049713, | |
| "kl": 0.0006369650363922119, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0, | |
| "reward": 0.05851969541981816, | |
| "reward_std": 0.09551909612491727, | |
| "rewards/cosine_scaled_reward": -0.021192173473536968, | |
| "rewards/format_reward": 0.37500000186264515, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 2987.1458435058594, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.17524327337741852, | |
| "kl": 0.0001463182270526886, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.0, | |
| "reward": -0.009090241976082325, | |
| "reward_std": 0.06974720861762762, | |
| "rewards/cosine_scaled_reward": -0.21435680365539156, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 3154.2708740234375, | |
| "epoch": 0.088, | |
| "grad_norm": 0.1655508130788803, | |
| "kl": 0.0002579297870397568, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": 0.0, | |
| "reward": -0.01178191090002656, | |
| "reward_std": 0.10129856411367655, | |
| "rewards/cosine_scaled_reward": -0.1495479578152299, | |
| "rewards/format_reward": 0.2291666753590107, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 3120.6041870117188, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.1709078699350357, | |
| "kl": 0.0009910427033901215, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.0, | |
| "reward": 0.03289509087335318, | |
| "reward_std": 0.09486977197229862, | |
| "rewards/cosine_scaled_reward": -0.06091844476759434, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 2322.7708702087402, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.2289884388446808, | |
| "kl": 0.001061469316482544, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.0, | |
| "reward": 0.08358410373330116, | |
| "reward_std": 0.09495366807095706, | |
| "rewards/cosine_scaled_reward": -0.00746832974255085, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 3293.2083740234375, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.18912553787231445, | |
| "kl": 0.0006625503301620483, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0, | |
| "reward": -0.006389252142980695, | |
| "reward_std": 0.13851621747016907, | |
| "rewards/cosine_scaled_reward": -0.1551999393850565, | |
| "rewards/format_reward": 0.2708333358168602, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 3064.145866394043, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.22587524354457855, | |
| "kl": 0.002162039279937744, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": 0.0001, | |
| "reward": -0.021476033609360456, | |
| "reward_std": 0.09152220841497183, | |
| "rewards/cosine_scaled_reward": -0.19870448019355536, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 2900.9791717529297, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.19733589887619019, | |
| "kl": 0.000873371958732605, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.0, | |
| "reward": 0.03715384565293789, | |
| "reward_std": 0.1097968677058816, | |
| "rewards/cosine_scaled_reward": -0.05757363699376583, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 2747.500045776367, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.23122522234916687, | |
| "kl": 0.0008766204118728638, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": 0.0, | |
| "reward": 0.021435680333524942, | |
| "reward_std": 0.12779827043414116, | |
| "rewards/cosine_scaled_reward": -0.11562954680994153, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 3090.750045776367, | |
| "epoch": 0.096, | |
| "grad_norm": 0.1742715984582901, | |
| "kl": 0.00035139918327331543, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0, | |
| "reward": 0.061643086373806, | |
| "reward_std": 0.16731480974704027, | |
| "rewards/cosine_scaled_reward": -0.016695552330929786, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 2862.5000762939453, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.15780915319919586, | |
| "kl": 0.00032967329025268555, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0, | |
| "reward": 0.059395642252638936, | |
| "reward_std": 0.18405951745808125, | |
| "rewards/cosine_scaled_reward": -0.07413570675998926, | |
| "rewards/format_reward": 0.5000000093132257, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 2822.458366394043, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.18948958814144135, | |
| "kl": 0.0012461543083190918, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0, | |
| "reward": 0.058563592843711376, | |
| "reward_std": 0.13514422718435526, | |
| "rewards/cosine_scaled_reward": -0.023547479882836342, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 2582.0208892822266, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.24197638034820557, | |
| "kl": 0.001571571920067072, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0001, | |
| "reward": 0.06321019981987774, | |
| "reward_std": 0.1301463134586811, | |
| "rewards/cosine_scaled_reward": -0.0854906840249896, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 2717.666717529297, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.2206323742866516, | |
| "kl": 0.0015416741371154785, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.0001, | |
| "reward": 0.10002893407363445, | |
| "reward_std": 0.15120992343872786, | |
| "rewards/cosine_scaled_reward": 0.02375661302357912, | |
| "rewards/format_reward": 0.5416666902601719, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 3077.7708587646484, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.18466134369373322, | |
| "kl": 0.0015412569046020508, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.0001, | |
| "reward": 0.019450924504781142, | |
| "reward_std": 0.1520949569530785, | |
| "rewards/cosine_scaled_reward": -0.10057051916373894, | |
| "rewards/format_reward": 0.31250000931322575, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 2275.2708587646484, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.317700058221817, | |
| "kl": 0.0030652284622192383, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0001, | |
| "reward": 0.020127289928495884, | |
| "reward_std": 0.09609310049563646, | |
| "rewards/cosine_scaled_reward": -0.2033098302781582, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 3104.2291870117188, | |
| "epoch": 0.104, | |
| "grad_norm": 0.16374754905700684, | |
| "kl": 0.0012220889329910278, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.0, | |
| "reward": 0.02835351601243019, | |
| "reward_std": 0.0958831796888262, | |
| "rewards/cosine_scaled_reward": -0.08190420269966125, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 2564.9583892822266, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.23695100843906403, | |
| "kl": 0.0031346678733825684, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.0001, | |
| "reward": 0.031697872560471296, | |
| "reward_std": 0.1256707413122058, | |
| "rewards/cosine_scaled_reward": -0.1565206847153604, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 3429.2916870117188, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.18266721069812775, | |
| "kl": 0.0016658008098602295, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.0001, | |
| "reward": -0.06380185973830521, | |
| "reward_std": 0.07419165363535285, | |
| "rewards/cosine_scaled_reward": -0.21879931539297104, | |
| "rewards/format_reward": 0.06250000186264515, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 2926.041717529297, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.183439701795578, | |
| "kl": 0.0018378198146820068, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.0001, | |
| "reward": 0.02929869778745342, | |
| "reward_std": 0.0936040470842272, | |
| "rewards/cosine_scaled_reward": -0.0937141003087163, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 3466.7708740234375, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.1366991549730301, | |
| "kl": 0.0007553547620773315, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.0, | |
| "reward": -0.011318721110001206, | |
| "reward_std": 0.11944798147305846, | |
| "rewards/cosine_scaled_reward": -0.1375442687422037, | |
| "rewards/format_reward": 0.20833333767950535, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 2708.729202270508, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.17744414508342743, | |
| "kl": 0.002089708112180233, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0001, | |
| "reward": 0.08605838101357222, | |
| "reward_std": 0.11997871845960617, | |
| "rewards/cosine_scaled_reward": 0.024741460452787578, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 3173.854248046875, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.18968307971954346, | |
| "kl": 0.0014801472425460815, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.0001, | |
| "reward": 0.031303239753469825, | |
| "reward_std": 0.11792595777660608, | |
| "rewards/cosine_scaled_reward": -0.06253299303352833, | |
| "rewards/format_reward": 0.31250000931322575, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 2743.937545776367, | |
| "epoch": 0.112, | |
| "grad_norm": 0.17238333821296692, | |
| "kl": 0.0005875229835510254, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.0, | |
| "reward": 0.03780742874369025, | |
| "reward_std": 0.09086114913225174, | |
| "rewards/cosine_scaled_reward": -0.14788446575403214, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 2848.7083587646484, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.24863041937351227, | |
| "kl": 0.0012496709823608398, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0001, | |
| "reward": 0.007481225358787924, | |
| "reward_std": 0.09957948396913707, | |
| "rewards/cosine_scaled_reward": -0.11191552877426147, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 2628.6458740234375, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.2551541328430176, | |
| "kl": 0.007792949676513672, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0003, | |
| "reward": 0.11142967082560062, | |
| "reward_std": 0.1601211791858077, | |
| "rewards/cosine_scaled_reward": 0.0880345068871975, | |
| "rewards/format_reward": 0.4791666828095913, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 2490.229202270508, | |
| "epoch": 0.11542857142857142, | |
| "grad_norm": 0.21603932976722717, | |
| "kl": 0.0017938017845153809, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.0001, | |
| "reward": 0.05852094758301973, | |
| "reward_std": 0.09475950035266578, | |
| "rewards/cosine_scaled_reward": -0.0666491650044918, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 2286.395866394043, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.22003820538520813, | |
| "kl": 0.0035685300827026367, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.0001, | |
| "reward": 0.06951136235147715, | |
| "reward_std": 0.11986843310296535, | |
| "rewards/cosine_scaled_reward": -0.09702310990542173, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 2706.5833854675293, | |
| "epoch": 0.11771428571428572, | |
| "grad_norm": 0.24211692810058594, | |
| "kl": 0.0033190250396728516, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0001, | |
| "reward": 0.02106085862033069, | |
| "reward_std": 0.13761026575230062, | |
| "rewards/cosine_scaled_reward": -0.1569200656376779, | |
| "rewards/format_reward": 0.43750000931322575, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 2745.125011444092, | |
| "epoch": 0.11885714285714286, | |
| "grad_norm": 0.2112489640712738, | |
| "kl": 0.0039566755294799805, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.0002, | |
| "reward": 0.013099167263135314, | |
| "reward_std": 0.1101553007028997, | |
| "rewards/cosine_scaled_reward": -0.14944082498550415, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 2631.812545776367, | |
| "epoch": 0.12, | |
| "grad_norm": 0.19800451397895813, | |
| "kl": 0.002551555633544922, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.0001, | |
| "reward": 0.08774281479418278, | |
| "reward_std": 0.15658214688301086, | |
| "rewards/cosine_scaled_reward": 0.02480911184102297, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 2240.000026702881, | |
| "epoch": 0.12114285714285715, | |
| "grad_norm": 0.1994587928056717, | |
| "kl": 0.0023127198219299316, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0001, | |
| "reward": 0.1655233004130423, | |
| "reward_std": 0.13684139621909708, | |
| "rewards/cosine_scaled_reward": 0.17548487707972527, | |
| "rewards/format_reward": 0.6250000111758709, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 2713.1250915527344, | |
| "epoch": 0.12228571428571429, | |
| "grad_norm": 0.23116803169250488, | |
| "kl": 0.0034139156341552734, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.0001, | |
| "reward": 0.07083619991317391, | |
| "reward_std": 0.12138741742819548, | |
| "rewards/cosine_scaled_reward": -0.05151521973311901, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 2540.8125762939453, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 0.20976760983467102, | |
| "kl": 0.0018079280853271484, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.0001, | |
| "reward": 0.07085073599591851, | |
| "reward_std": 0.15451500099152327, | |
| "rewards/cosine_scaled_reward": -0.041776820085942745, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 3043.500045776367, | |
| "epoch": 0.12457142857142857, | |
| "grad_norm": 0.16290034353733063, | |
| "kl": 0.0009417533874511719, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": 0.0, | |
| "reward": 0.03415968408808112, | |
| "reward_std": 0.09731752565130591, | |
| "rewards/cosine_scaled_reward": -0.08652829378843307, | |
| "rewards/format_reward": 0.37500000931322575, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 2716.9583587646484, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.202264204621315, | |
| "kl": 0.0016897767782211304, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0001, | |
| "reward": 0.030952309258282185, | |
| "reward_std": 0.13009351352229714, | |
| "rewards/cosine_scaled_reward": -0.11608889419585466, | |
| "rewards/format_reward": 0.41666666977107525, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 3076.3959045410156, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.1983112394809723, | |
| "kl": 0.002537250518798828, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.0001, | |
| "reward": 0.059007523115724325, | |
| "reward_std": 0.16636842489242554, | |
| "rewards/cosine_scaled_reward": -0.0232586320489645, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 3080.041717529297, | |
| "epoch": 0.128, | |
| "grad_norm": 0.15135328471660614, | |
| "kl": 0.0012042894959449768, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0, | |
| "reward": 0.06036388734355569, | |
| "reward_std": 0.11590251373127103, | |
| "rewards/cosine_scaled_reward": -0.008873747196048498, | |
| "rewards/format_reward": 0.37500000931322575, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 2676.9583435058594, | |
| "epoch": 0.12914285714285714, | |
| "grad_norm": 0.2907523810863495, | |
| "kl": 0.0033299922943115234, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.0001, | |
| "reward": 0.021284373477101326, | |
| "reward_std": 0.12445678655058146, | |
| "rewards/cosine_scaled_reward": -0.12851847242563963, | |
| "rewards/format_reward": 0.37500001303851604, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 2599.645866394043, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.21558967232704163, | |
| "kl": 0.0046939849853515625, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0002, | |
| "reward": 0.030540801119059324, | |
| "reward_std": 0.09902108740061522, | |
| "rewards/cosine_scaled_reward": -0.19062614813446999, | |
| "rewards/format_reward": 0.562500013038516, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 2822.729179382324, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.20229722559452057, | |
| "kl": 0.0031037330627441406, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0001, | |
| "reward": 0.016621847171336412, | |
| "reward_std": 0.1235029874369502, | |
| "rewards/cosine_scaled_reward": -0.14894464937970042, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 3348.0833435058594, | |
| "epoch": 0.13257142857142856, | |
| "grad_norm": 0.17120520770549774, | |
| "kl": 0.0019791126251220703, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.0001, | |
| "reward": -0.046343902591615915, | |
| "reward_std": 0.09335399139672518, | |
| "rewards/cosine_scaled_reward": -0.2216574940830469, | |
| "rewards/format_reward": 0.16666667349636555, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 2949.8958740234375, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.27388995885849, | |
| "kl": 0.0031032562255859375, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.0001, | |
| "reward": -0.03218040708452463, | |
| "reward_std": 0.08237923681735992, | |
| "rewards/cosine_scaled_reward": -0.22971701715141535, | |
| "rewards/format_reward": 0.27083334140479565, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 2783.3542098999023, | |
| "epoch": 0.13485714285714287, | |
| "grad_norm": 0.17567184567451477, | |
| "kl": 0.001497507095336914, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0001, | |
| "reward": 0.12685386650264263, | |
| "reward_std": 0.13957502879202366, | |
| "rewards/cosine_scaled_reward": 0.1258874498307705, | |
| "rewards/format_reward": 0.5000000018626451, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 2406.875030517578, | |
| "epoch": 0.136, | |
| "grad_norm": 0.2555752098560333, | |
| "kl": 0.004542350769042969, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.0002, | |
| "reward": 0.094550846144557, | |
| "reward_std": 0.12869207374751568, | |
| "rewards/cosine_scaled_reward": -0.0038661109283566475, | |
| "rewards/format_reward": 0.562500013038516, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 2285.6250762939453, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.23725992441177368, | |
| "kl": 0.002517223358154297, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0001, | |
| "reward": 0.08064356981776655, | |
| "reward_std": 0.12271030526608229, | |
| "rewards/cosine_scaled_reward": -0.0756605202332139, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 1854.8542022705078, | |
| "epoch": 0.1382857142857143, | |
| "grad_norm": 0.2627831697463989, | |
| "kl": 0.003121614456176758, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.0001, | |
| "reward": 0.11522941256407648, | |
| "reward_std": 0.12531911802943796, | |
| "rewards/cosine_scaled_reward": -0.03698302572593093, | |
| "rewards/format_reward": 0.750000013038516, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 2925.1458892822266, | |
| "epoch": 0.13942857142857143, | |
| "grad_norm": 0.18577584624290466, | |
| "kl": 0.0021719932556152344, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.0001, | |
| "reward": 0.07248353259637952, | |
| "reward_std": 0.15761876897886395, | |
| "rewards/cosine_scaled_reward": 0.006229955703020096, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 2714.7708740234375, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.17778825759887695, | |
| "kl": 0.002131819725036621, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.0001, | |
| "reward": 0.04195330070797354, | |
| "reward_std": 0.11853250442072749, | |
| "rewards/cosine_scaled_reward": -0.11678100191056728, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 2183.958381652832, | |
| "epoch": 0.1417142857142857, | |
| "grad_norm": 0.2191576361656189, | |
| "kl": 0.005345821380615234, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.0002, | |
| "reward": 0.0987518366164295, | |
| "reward_std": 0.1706788558512926, | |
| "rewards/cosine_scaled_reward": -0.011822802014648914, | |
| "rewards/format_reward": 0.6041666753590107, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 2888.354202270508, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.16155032813549042, | |
| "kl": 0.0020842552185058594, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.0001, | |
| "reward": 0.04314285283908248, | |
| "reward_std": 0.1018235448282212, | |
| "rewards/cosine_scaled_reward": -0.029325059265829623, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 2848.166702270508, | |
| "epoch": 0.144, | |
| "grad_norm": 0.1632193773984909, | |
| "kl": 0.0012958049774169922, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0001, | |
| "reward": 0.05145277862902731, | |
| "reward_std": 0.1218188302591443, | |
| "rewards/cosine_scaled_reward": -0.06772069446742535, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 2989.2708892822266, | |
| "epoch": 0.14514285714285713, | |
| "grad_norm": 0.1904735267162323, | |
| "kl": 0.0023894309997558594, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.0001, | |
| "reward": 0.010273065650835633, | |
| "reward_std": 0.11918974481523037, | |
| "rewards/cosine_scaled_reward": -0.1677860009167489, | |
| "rewards/format_reward": 0.39583334513008595, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 2901.5625228881836, | |
| "epoch": 0.1462857142857143, | |
| "grad_norm": 0.18506519496440887, | |
| "kl": 0.003477931022644043, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.0001, | |
| "reward": 0.08515205327421427, | |
| "reward_std": 0.14246997330337763, | |
| "rewards/cosine_scaled_reward": 0.07529417611658573, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 3311.187530517578, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.22474369406700134, | |
| "kl": 0.0030045509338378906, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.0001, | |
| "reward": -0.026094807864865288, | |
| "reward_std": 0.12371798837557435, | |
| "rewards/cosine_scaled_reward": -0.19261731766164303, | |
| "rewards/format_reward": 0.22916667349636555, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 2830.0833587646484, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.18044407665729523, | |
| "kl": 0.0024716854095458984, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0001, | |
| "reward": -0.006927699316293001, | |
| "reward_std": 0.08703188924118876, | |
| "rewards/cosine_scaled_reward": -0.1649078167974949, | |
| "rewards/format_reward": 0.29166667349636555, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 2923.0000228881836, | |
| "epoch": 0.14971428571428572, | |
| "grad_norm": 0.22359904646873474, | |
| "kl": 0.0043888092041015625, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.0002, | |
| "reward": 0.04467150871641934, | |
| "reward_std": 0.11750007548835129, | |
| "rewards/cosine_scaled_reward": -0.02377407788299024, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 2468.4583587646484, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.20087915658950806, | |
| "kl": 0.0021352767944335938, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.0001, | |
| "reward": 0.04917083401232958, | |
| "reward_std": 0.10673188930377364, | |
| "rewards/cosine_scaled_reward": -0.07255440950393677, | |
| "rewards/format_reward": 0.43750000186264515, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 2899.750030517578, | |
| "epoch": 0.152, | |
| "grad_norm": 0.2198459506034851, | |
| "kl": 0.0033211708068847656, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.0001, | |
| "reward": 0.007418630411848426, | |
| "reward_std": 0.09727612743154168, | |
| "rewards/cosine_scaled_reward": -0.16582289477810264, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 2317.9375228881836, | |
| "epoch": 0.15314285714285714, | |
| "grad_norm": 0.19853614270687103, | |
| "kl": 0.004344940185546875, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.0002, | |
| "reward": 0.09111709147691727, | |
| "reward_std": 0.11203813040629029, | |
| "rewards/cosine_scaled_reward": -0.02176826912909746, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 1802.4791946411133, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.26218482851982117, | |
| "kl": 0.0051059722900390625, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.0002, | |
| "reward": 0.19630146119743586, | |
| "reward_std": 0.15289967821445316, | |
| "rewards/cosine_scaled_reward": 0.23608196713030338, | |
| "rewards/format_reward": 0.6875000111758709, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 2570.854248046875, | |
| "epoch": 0.15542857142857142, | |
| "grad_norm": 0.20145438611507416, | |
| "kl": 0.0037708282470703125, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.0002, | |
| "reward": 0.0829686057404615, | |
| "reward_std": 0.1752019147388637, | |
| "rewards/cosine_scaled_reward": -0.026120582595467567, | |
| "rewards/format_reward": 0.5416666753590107, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 2661.2500381469727, | |
| "epoch": 0.15657142857142858, | |
| "grad_norm": 0.19357463717460632, | |
| "kl": 0.0031414031982421875, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.0001, | |
| "reward": 0.0038152660708874464, | |
| "reward_std": 0.10811280179768801, | |
| "rewards/cosine_scaled_reward": -0.21752969082444906, | |
| "rewards/format_reward": 0.45833334513008595, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 2141.166702270508, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.2089914232492447, | |
| "kl": 0.002676725387573242, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.0001, | |
| "reward": 0.14542343048378825, | |
| "reward_std": 0.15844334475696087, | |
| "rewards/cosine_scaled_reward": 0.04235106392297894, | |
| "rewards/format_reward": 0.7708333618938923, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 2820.8750610351562, | |
| "epoch": 0.15885714285714286, | |
| "grad_norm": 0.18096813559532166, | |
| "kl": 0.0042743682861328125, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.0002, | |
| "reward": 0.06168772419914603, | |
| "reward_std": 0.14358124835416675, | |
| "rewards/cosine_scaled_reward": -0.07845509238541126, | |
| "rewards/format_reward": 0.520833345130086, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 2436.354217529297, | |
| "epoch": 0.16, | |
| "grad_norm": 0.27698883414268494, | |
| "kl": 0.0058422088623046875, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0002, | |
| "reward": 0.044038140535121784, | |
| "reward_std": 0.07816965272650123, | |
| "rewards/cosine_scaled_reward": -0.11026089265942574, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 2359.375045776367, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.1908435970544815, | |
| "kl": 0.0043811798095703125, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.0002, | |
| "reward": 0.03223781171254814, | |
| "reward_std": 0.11182020884007215, | |
| "rewards/cosine_scaled_reward": -0.1976433489471674, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 2460.187530517578, | |
| "epoch": 0.16228571428571428, | |
| "grad_norm": 0.1981201469898224, | |
| "kl": 0.0038270950317382812, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.0002, | |
| "reward": 0.08847181824967265, | |
| "reward_std": 0.13877713168039918, | |
| "rewards/cosine_scaled_reward": -0.06368941674008965, | |
| "rewards/format_reward": 0.645833345130086, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 2437.791702270508, | |
| "epoch": 0.16342857142857142, | |
| "grad_norm": 0.3880097270011902, | |
| "kl": 0.023929595947265625, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.001, | |
| "reward": 0.016975378792267293, | |
| "reward_std": 0.12177514331415296, | |
| "rewards/cosine_scaled_reward": -0.20094774826429784, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 2708.916732788086, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.21965523064136505, | |
| "kl": 0.00417327880859375, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.0002, | |
| "reward": 0.030230441665480612, | |
| "reward_std": 0.17415813449770212, | |
| "rewards/cosine_scaled_reward": -0.11869808053597808, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 1839.4375228881836, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.2578926384449005, | |
| "kl": 0.004568576812744141, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.0002, | |
| "reward": 0.15131515043321997, | |
| "reward_std": 0.15678277891129255, | |
| "rewards/cosine_scaled_reward": 0.0914019983028993, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 1990.5833892822266, | |
| "epoch": 0.16685714285714287, | |
| "grad_norm": 0.19752271473407745, | |
| "kl": 0.003320455551147461, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.0001, | |
| "reward": 0.09106699889525771, | |
| "reward_std": 0.10312592587433755, | |
| "rewards/cosine_scaled_reward": -0.1385479016462341, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 1966.604232788086, | |
| "epoch": 0.168, | |
| "grad_norm": 0.26733097434043884, | |
| "kl": 0.006213188171386719, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": 0.0002, | |
| "reward": 0.12648878013715148, | |
| "reward_std": 0.1677848151884973, | |
| "rewards/cosine_scaled_reward": 0.016043312381953, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 1654.6250457763672, | |
| "epoch": 0.16914285714285715, | |
| "grad_norm": 0.2192990928888321, | |
| "kl": 0.004794120788574219, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0002, | |
| "reward": 0.09204956935718656, | |
| "reward_std": 0.08866735780611634, | |
| "rewards/cosine_scaled_reward": -0.13534213416278362, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 2345.729232788086, | |
| "epoch": 0.1702857142857143, | |
| "grad_norm": 0.2061564177274704, | |
| "kl": 0.0041484832763671875, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.0002, | |
| "reward": 0.08000461710616946, | |
| "reward_std": 0.12683053640648723, | |
| "rewards/cosine_scaled_reward": -0.06624547764658928, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 2006.6250381469727, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.24100010097026825, | |
| "kl": 0.00368499755859375, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.0001, | |
| "reward": 0.08540961390826851, | |
| "reward_std": 0.15422609634697437, | |
| "rewards/cosine_scaled_reward": -0.0530257155187428, | |
| "rewards/format_reward": 0.6041666734963655, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 1996.520881652832, | |
| "epoch": 0.17257142857142857, | |
| "grad_norm": 0.28235289454460144, | |
| "kl": 0.005840301513671875, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.0002, | |
| "reward": 0.12831606157124043, | |
| "reward_std": 0.131237086141482, | |
| "rewards/cosine_scaled_reward": 0.031276384368538857, | |
| "rewards/format_reward": 0.6875000204890966, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 2301.0208435058594, | |
| "epoch": 0.1737142857142857, | |
| "grad_norm": 0.3784143626689911, | |
| "kl": 0.0074005126953125, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.0003, | |
| "reward": 0.025953251402825117, | |
| "reward_std": 0.11646852549165487, | |
| "rewards/cosine_scaled_reward": -0.18527186242863536, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 2342.0834045410156, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.20556333661079407, | |
| "kl": 0.0073909759521484375, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": 0.0003, | |
| "reward": 0.03766886703670025, | |
| "reward_std": 0.08948473795317113, | |
| "rewards/cosine_scaled_reward": -0.19193847686983645, | |
| "rewards/format_reward": 0.6041666753590107, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 3022.5833740234375, | |
| "epoch": 0.176, | |
| "grad_norm": 0.22437125444412231, | |
| "kl": 0.00362396240234375, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.0001, | |
| "reward": 0.06817820528522134, | |
| "reward_std": 0.16511888336390257, | |
| "rewards/cosine_scaled_reward": -0.018517197109758854, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 2435.0625610351562, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.21803762018680573, | |
| "kl": 0.004116058349609375, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.0002, | |
| "reward": 0.11520516406744719, | |
| "reward_std": 0.13615657854825258, | |
| "rewards/cosine_scaled_reward": 0.03556065930752084, | |
| "rewards/format_reward": 0.6041666734963655, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 2320.583351135254, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.1894351989030838, | |
| "kl": 0.00403594970703125, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.0002, | |
| "reward": 0.09686407796107233, | |
| "reward_std": 0.13217018358409405, | |
| "rewards/cosine_scaled_reward": 0.03549210913479328, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 2527.5834197998047, | |
| "epoch": 0.17942857142857144, | |
| "grad_norm": 0.2570294737815857, | |
| "kl": 0.005597114562988281, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.0002, | |
| "reward": 0.046291103353723884, | |
| "reward_std": 0.09627518011257052, | |
| "rewards/cosine_scaled_reward": -0.15388192608952522, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 2241.729248046875, | |
| "epoch": 0.18057142857142858, | |
| "grad_norm": 0.21536891162395477, | |
| "kl": 0.0056514739990234375, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.0002, | |
| "reward": 0.15251322067342699, | |
| "reward_std": 0.13267278019338846, | |
| "rewards/cosine_scaled_reward": 0.06854456290602684, | |
| "rewards/format_reward": 0.7500000186264515, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 2231.7708587646484, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.1829807013273239, | |
| "kl": 0.004848480224609375, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.0002, | |
| "reward": 0.044310242868959904, | |
| "reward_std": 0.11434727860614657, | |
| "rewards/cosine_scaled_reward": -0.16176462545990944, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 2064.312530517578, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.23337005078792572, | |
| "kl": 0.006267547607421875, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0003, | |
| "reward": 0.10007608711021021, | |
| "reward_std": 0.1771828606724739, | |
| "rewards/cosine_scaled_reward": -0.04113523324485868, | |
| "rewards/format_reward": 0.6666666828095913, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 2270.937545776367, | |
| "epoch": 0.184, | |
| "grad_norm": 0.3289494812488556, | |
| "kl": 0.007640838623046875, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.0003, | |
| "reward": 0.053364482591859996, | |
| "reward_std": 0.14938203245401382, | |
| "rewards/cosine_scaled_reward": -0.11477282957639545, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 2312.416702270508, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.26157301664352417, | |
| "kl": 0.008253097534179688, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.0003, | |
| "reward": 0.04589501162990928, | |
| "reward_std": 0.12189697381108999, | |
| "rewards/cosine_scaled_reward": -0.13589882757514715, | |
| "rewards/format_reward": 0.5416666753590107, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 2191.4583740234375, | |
| "epoch": 0.18628571428571428, | |
| "grad_norm": 0.2897321283817291, | |
| "kl": 0.00688934326171875, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.0003, | |
| "reward": 0.13259745202958584, | |
| "reward_std": 0.06983533198945224, | |
| "rewards/cosine_scaled_reward": 0.07808320969343185, | |
| "rewards/format_reward": 0.625, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 1521.0625610351562, | |
| "epoch": 0.18742857142857142, | |
| "grad_norm": 0.2747092843055725, | |
| "kl": 0.009218215942382812, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.0004, | |
| "reward": 0.12581492541357875, | |
| "reward_std": 0.09997158916667104, | |
| "rewards/cosine_scaled_reward": -0.03640655893832445, | |
| "rewards/format_reward": 0.8125000055879354, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 1848.2500228881836, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.275541216135025, | |
| "kl": 0.006542205810546875, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0003, | |
| "reward": 0.0523092825897038, | |
| "reward_std": 0.12433975096791983, | |
| "rewards/cosine_scaled_reward": -0.1790934158489108, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 2049.0417251586914, | |
| "epoch": 0.18971428571428572, | |
| "grad_norm": 0.1916612833738327, | |
| "kl": 0.0053234100341796875, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.0002, | |
| "reward": 0.09825841523706913, | |
| "reward_std": 0.15324107883498073, | |
| "rewards/cosine_scaled_reward": -0.0768615622073412, | |
| "rewards/format_reward": 0.7291666734963655, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 1554.1666946411133, | |
| "epoch": 0.19085714285714286, | |
| "grad_norm": 0.22473838925361633, | |
| "kl": 0.0043048858642578125, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.0002, | |
| "reward": 0.12430966179817915, | |
| "reward_std": 0.12945860624313354, | |
| "rewards/cosine_scaled_reward": -0.07466088375076652, | |
| "rewards/format_reward": 0.8750000111758709, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 2357.5209045410156, | |
| "epoch": 0.192, | |
| "grad_norm": 0.218026265501976, | |
| "kl": 0.0063018798828125, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.0003, | |
| "reward": 0.0786146642640233, | |
| "reward_std": 0.14164651185274124, | |
| "rewards/cosine_scaled_reward": -0.08221817389130592, | |
| "rewards/format_reward": 0.6250000223517418, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 1244.0625381469727, | |
| "epoch": 0.19314285714285714, | |
| "grad_norm": 0.2136813998222351, | |
| "kl": 0.005023002624511719, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.0002, | |
| "reward": 0.2644330468028784, | |
| "reward_std": 0.10949333664029837, | |
| "rewards/cosine_scaled_reward": 0.2890866380184889, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 2086.333366394043, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.23367033898830414, | |
| "kl": 0.0049877166748046875, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0002, | |
| "reward": 0.13441222603432834, | |
| "reward_std": 0.12571046565426514, | |
| "rewards/cosine_scaled_reward": 0.07233863137662411, | |
| "rewards/format_reward": 0.6458333395421505, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 2040.7708587646484, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.2201111614704132, | |
| "kl": 0.004878044128417969, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.0002, | |
| "reward": 0.07799222506582737, | |
| "reward_std": 0.1219726437702775, | |
| "rewards/cosine_scaled_reward": -0.09505355032160878, | |
| "rewards/format_reward": 0.6458333395421505, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 2090.562557220459, | |
| "epoch": 0.19657142857142856, | |
| "grad_norm": 0.24928785860538483, | |
| "kl": 0.008022308349609375, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.0003, | |
| "reward": 0.1555600226856768, | |
| "reward_std": 0.12234124867245555, | |
| "rewards/cosine_scaled_reward": 0.14208754245191813, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 1355.5000228881836, | |
| "epoch": 0.1977142857142857, | |
| "grad_norm": 0.2922389805316925, | |
| "kl": 0.00618743896484375, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.0002, | |
| "reward": 0.07720496540423483, | |
| "reward_std": 0.08944753208197653, | |
| "rewards/cosine_scaled_reward": -0.16825812682509422, | |
| "rewards/format_reward": 0.7916666679084301, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 1297.6042098999023, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.21333926916122437, | |
| "kl": 0.008691787719726562, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.0003, | |
| "reward": 0.1351238526403904, | |
| "reward_std": 0.09852686384692788, | |
| "rewards/cosine_scaled_reward": -0.0807552793994546, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 1959.2708892822266, | |
| "epoch": 0.2, | |
| "grad_norm": 0.22018226981163025, | |
| "kl": 0.006298065185546875, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.0003, | |
| "reward": 0.14479633374139667, | |
| "reward_std": 0.0969647653400898, | |
| "rewards/cosine_scaled_reward": 0.06122639961540699, | |
| "rewards/format_reward": 0.7291666753590107, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 1673.6875381469727, | |
| "epoch": 0.20114285714285715, | |
| "grad_norm": 0.2576465308666229, | |
| "kl": 0.0056438446044921875, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.0002, | |
| "reward": 0.0873019965365529, | |
| "reward_std": 0.11411390919238329, | |
| "rewards/cosine_scaled_reward": -0.12017827155068517, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 2214.270881652832, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.28319552540779114, | |
| "kl": 0.0062694549560546875, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.0003, | |
| "reward": 0.08295768650714308, | |
| "reward_std": 0.10326328268274665, | |
| "rewards/cosine_scaled_reward": -0.09064689744263887, | |
| "rewards/format_reward": 0.6666666753590107, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 1769.8542022705078, | |
| "epoch": 0.20342857142857143, | |
| "grad_norm": 0.28282982110977173, | |
| "kl": 0.00852203369140625, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.0003, | |
| "reward": 0.1297212722711265, | |
| "reward_std": 0.15833280980587006, | |
| "rewards/cosine_scaled_reward": 0.026437816210091114, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 2106.7291717529297, | |
| "epoch": 0.20457142857142857, | |
| "grad_norm": 0.21418659389019012, | |
| "kl": 0.00591278076171875, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.0002, | |
| "reward": 0.051371646230109036, | |
| "reward_std": 0.10319430893287063, | |
| "rewards/cosine_scaled_reward": -0.15093043667729944, | |
| "rewards/format_reward": 0.6041666697710752, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 1345.2083778381348, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.28628435730934143, | |
| "kl": 0.00799560546875, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0003, | |
| "reward": 0.2407036293298006, | |
| "reward_std": 0.17751038947608322, | |
| "rewards/cosine_scaled_reward": 0.25933289900422096, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 1969.208381652832, | |
| "epoch": 0.20685714285714285, | |
| "grad_norm": 0.24428333342075348, | |
| "kl": 0.00662994384765625, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.0003, | |
| "reward": 0.09334775037132204, | |
| "reward_std": 0.12076092883944511, | |
| "rewards/cosine_scaled_reward": -0.08935475163161755, | |
| "rewards/format_reward": 0.7291666828095913, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 1827.2291946411133, | |
| "epoch": 0.208, | |
| "grad_norm": 0.19231781363487244, | |
| "kl": 0.0035953521728515625, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.0001, | |
| "reward": 0.10479269758798182, | |
| "reward_std": 0.11822887184098363, | |
| "rewards/cosine_scaled_reward": -0.06516919657588005, | |
| "rewards/format_reward": 0.7500000055879354, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 1392.8125457763672, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.264604389667511, | |
| "kl": 0.009449005126953125, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.0004, | |
| "reward": 0.1556489015929401, | |
| "reward_std": 0.10147193586453795, | |
| "rewards/cosine_scaled_reward": 0.019236549735069275, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 1635.062551498413, | |
| "epoch": 0.2102857142857143, | |
| "grad_norm": 0.2975788414478302, | |
| "kl": 0.00707244873046875, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.0003, | |
| "reward": 0.0682187182828784, | |
| "reward_std": 0.0552776656113565, | |
| "rewards/cosine_scaled_reward": -0.20538464561104774, | |
| "rewards/format_reward": 0.8125000018626451, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 1627.1250686645508, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.26065441966056824, | |
| "kl": 0.0062122344970703125, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.0002, | |
| "reward": 0.06265506497584283, | |
| "reward_std": 0.11735227680765092, | |
| "rewards/cosine_scaled_reward": -0.19095464050769806, | |
| "rewards/format_reward": 0.7500000093132257, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 1840.1458587646484, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.2078036367893219, | |
| "kl": 0.007350921630859375, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.0003, | |
| "reward": 0.09769681794568896, | |
| "reward_std": 0.09951704926788807, | |
| "rewards/cosine_scaled_reward": -0.06353580858558416, | |
| "rewards/format_reward": 0.7083333414047956, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 1557.5416870117188, | |
| "epoch": 0.21371428571428572, | |
| "grad_norm": 0.44674429297447205, | |
| "kl": 0.008470535278320312, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.0003, | |
| "reward": 0.09892327198758721, | |
| "reward_std": 0.0855756844393909, | |
| "rewards/cosine_scaled_reward": -0.1240589041262865, | |
| "rewards/format_reward": 0.8333333414047956, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 2297.3333587646484, | |
| "epoch": 0.21485714285714286, | |
| "grad_norm": 0.2045762985944748, | |
| "kl": 0.008823394775390625, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.0004, | |
| "reward": 0.05021746223792434, | |
| "reward_std": 0.06788925174623728, | |
| "rewards/cosine_scaled_reward": -0.15488196723163128, | |
| "rewards/format_reward": 0.6041666697710752, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 1355.6875381469727, | |
| "epoch": 0.216, | |
| "grad_norm": 0.29116666316986084, | |
| "kl": 0.0076541900634765625, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.0003, | |
| "reward": 0.07565085194073617, | |
| "reward_std": 0.08206533431075513, | |
| "rewards/cosine_scaled_reward": -0.19497415097430348, | |
| "rewards/format_reward": 0.8333333358168602, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 1135.4167022705078, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.2219790816307068, | |
| "kl": 0.006549835205078125, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0003, | |
| "reward": 0.15915191872045398, | |
| "reward_std": 0.08802533126436174, | |
| "rewards/cosine_scaled_reward": -0.022108266479335725, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 1071.7708587646484, | |
| "epoch": 0.21828571428571428, | |
| "grad_norm": 0.26136839389801025, | |
| "kl": 0.006053924560546875, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.0002, | |
| "reward": 0.20304076466709375, | |
| "reward_std": 0.12852966412901878, | |
| "rewards/cosine_scaled_reward": 0.10831178847001866, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 1712.4583740234375, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.20474177598953247, | |
| "kl": 0.006473541259765625, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.0003, | |
| "reward": 0.09229559730738401, | |
| "reward_std": 0.1016106829047203, | |
| "rewards/cosine_scaled_reward": -0.1470730509608984, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 2210.395851135254, | |
| "epoch": 0.22057142857142858, | |
| "grad_norm": 1.7929117679595947, | |
| "kl": 0.055267333984375, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.0022, | |
| "reward": 0.03142698993906379, | |
| "reward_std": 0.12175118830054998, | |
| "rewards/cosine_scaled_reward": -0.18109365366399288, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 2227.291717529297, | |
| "epoch": 0.22171428571428572, | |
| "grad_norm": 0.21808066964149475, | |
| "kl": 0.00843048095703125, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.0003, | |
| "reward": 0.1474433816038072, | |
| "reward_std": 0.17017615539953113, | |
| "rewards/cosine_scaled_reward": 0.07256746315397322, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 1330.1458435058594, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.2461828738451004, | |
| "kl": 0.008304595947265625, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0003, | |
| "reward": 0.13772209081798792, | |
| "reward_std": 0.13301934953778982, | |
| "rewards/cosine_scaled_reward": -0.04500130284577608, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 2083.5000762939453, | |
| "epoch": 0.224, | |
| "grad_norm": 0.2871836721897125, | |
| "kl": 0.0097198486328125, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0004, | |
| "reward": 0.11253911699168384, | |
| "reward_std": 0.1284322296269238, | |
| "rewards/cosine_scaled_reward": -0.02314686682075262, | |
| "rewards/format_reward": 0.7083333544433117, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 1072.708366394043, | |
| "epoch": 0.22514285714285714, | |
| "grad_norm": 0.28244540095329285, | |
| "kl": 0.006664276123046875, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.0003, | |
| "reward": 0.14220814127475023, | |
| "reward_std": 0.1526435911655426, | |
| "rewards/cosine_scaled_reward": -0.0398729182779789, | |
| "rewards/format_reward": 0.9166666679084301, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 1305.8750305175781, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 0.3013690710067749, | |
| "kl": 0.009075164794921875, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.0004, | |
| "reward": 0.1608109144726768, | |
| "reward_std": 0.13874499686062336, | |
| "rewards/cosine_scaled_reward": 0.03152369521558285, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 1484.0000457763672, | |
| "epoch": 0.22742857142857142, | |
| "grad_norm": 0.21533454954624176, | |
| "kl": 0.0067596435546875, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.0003, | |
| "reward": 0.11915545212104917, | |
| "reward_std": 0.10307862563058734, | |
| "rewards/cosine_scaled_reward": -0.1427280263742432, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 1291.8750228881836, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.2203797996044159, | |
| "kl": 0.008068084716796875, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0003, | |
| "reward": 0.1579569444875233, | |
| "reward_std": 0.1443558344617486, | |
| "rewards/cosine_scaled_reward": -0.0025847081560641527, | |
| "rewards/format_reward": 0.9375, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 1646.3958892822266, | |
| "epoch": 0.2297142857142857, | |
| "grad_norm": 0.24083541333675385, | |
| "kl": 0.00879669189453125, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.0004, | |
| "reward": 0.21994818467646837, | |
| "reward_std": 0.11868779285578057, | |
| "rewards/cosine_scaled_reward": 0.23895522952079773, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 1287.6041946411133, | |
| "epoch": 0.23085714285714284, | |
| "grad_norm": 0.3120401203632355, | |
| "kl": 0.016357421875, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.0007, | |
| "reward": 0.21438546478748322, | |
| "reward_std": 0.06259978096932173, | |
| "rewards/cosine_scaled_reward": 0.1961576696485281, | |
| "rewards/format_reward": 0.875, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 1295.270896911621, | |
| "epoch": 0.232, | |
| "grad_norm": 0.25308915972709656, | |
| "kl": 0.008411407470703125, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.0003, | |
| "reward": 0.16257827286608517, | |
| "reward_std": 0.13092664163559675, | |
| "rewards/cosine_scaled_reward": 0.02888377010822296, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 1253.0417022705078, | |
| "epoch": 0.23314285714285715, | |
| "grad_norm": 0.36675336956977844, | |
| "kl": 0.01125335693359375, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.0005, | |
| "reward": 0.19385905005037785, | |
| "reward_std": 0.12034193379804492, | |
| "rewards/cosine_scaled_reward": 0.07674635015428066, | |
| "rewards/format_reward": 0.9583333358168602, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 1501.5417251586914, | |
| "epoch": 0.2342857142857143, | |
| "grad_norm": 0.2521401345729828, | |
| "kl": 0.00830841064453125, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.0003, | |
| "reward": 0.24430920276790857, | |
| "reward_std": 0.17064828611910343, | |
| "rewards/cosine_scaled_reward": 0.26697138883173466, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 1941.3958740234375, | |
| "epoch": 0.23542857142857143, | |
| "grad_norm": 0.30331704020500183, | |
| "kl": 0.00870513916015625, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.0003, | |
| "reward": 0.045394688844680786, | |
| "reward_std": 0.09498168341815472, | |
| "rewards/cosine_scaled_reward": -0.24192016012966633, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 1438.8333587646484, | |
| "epoch": 0.23657142857142857, | |
| "grad_norm": 0.31282100081443787, | |
| "kl": 0.0115203857421875, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.0005, | |
| "reward": 0.11734095495194197, | |
| "reward_std": 0.1474492819979787, | |
| "rewards/cosine_scaled_reward": -0.08522756304591894, | |
| "rewards/format_reward": 0.8541666939854622, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 1113.7500228881836, | |
| "epoch": 0.2377142857142857, | |
| "grad_norm": 0.2624795734882355, | |
| "kl": 0.008823394775390625, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.0004, | |
| "reward": 0.17374667339026928, | |
| "reward_std": 0.10973423393443227, | |
| "rewards/cosine_scaled_reward": 0.023771056905388832, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 1030.7708625793457, | |
| "epoch": 0.23885714285714285, | |
| "grad_norm": 0.32483094930648804, | |
| "kl": 0.00803375244140625, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.0003, | |
| "reward": 0.18928107433021069, | |
| "reward_std": 0.10979128838516772, | |
| "rewards/cosine_scaled_reward": 0.07668027561157942, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 1972.9167022705078, | |
| "epoch": 0.24, | |
| "grad_norm": 0.17738643288612366, | |
| "kl": 0.008514404296875, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0003, | |
| "reward": 0.14470667950809002, | |
| "reward_std": 0.13229582412168384, | |
| "rewards/cosine_scaled_reward": 0.037202537059783936, | |
| "rewards/format_reward": 0.7708333395421505, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 1598.7500228881836, | |
| "epoch": 0.24114285714285713, | |
| "grad_norm": 0.27847760915756226, | |
| "kl": 0.011180877685546875, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.0004, | |
| "reward": 0.14864719624165446, | |
| "reward_std": 0.11434817017288879, | |
| "rewards/cosine_scaled_reward": 0.04204285331070423, | |
| "rewards/format_reward": 0.7916666772216558, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 1177.6875190734863, | |
| "epoch": 0.2422857142857143, | |
| "grad_norm": 0.28754156827926636, | |
| "kl": 0.01055145263671875, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.0004, | |
| "reward": 0.17801955621689558, | |
| "reward_std": 0.10441821068525314, | |
| "rewards/cosine_scaled_reward": 0.1050565280020237, | |
| "rewards/format_reward": 0.8333333414047956, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 1114.354190826416, | |
| "epoch": 0.24342857142857144, | |
| "grad_norm": 0.4188780188560486, | |
| "kl": 0.011600494384765625, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.0005, | |
| "reward": 0.17088149162009358, | |
| "reward_std": 0.11700731026940048, | |
| "rewards/cosine_scaled_reward": 0.030954405665397644, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 1921.0833587646484, | |
| "epoch": 0.24457142857142858, | |
| "grad_norm": 0.22250115871429443, | |
| "kl": 0.010768890380859375, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.0004, | |
| "reward": 0.165664148516953, | |
| "reward_std": 0.15507009578868747, | |
| "rewards/cosine_scaled_reward": 0.08362742932513356, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 1409.3333587646484, | |
| "epoch": 0.24571428571428572, | |
| "grad_norm": 0.26721227169036865, | |
| "kl": 0.007991790771484375, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0003, | |
| "reward": 0.11459713708609343, | |
| "reward_std": 0.10593406949192286, | |
| "rewards/cosine_scaled_reward": -0.08912589284591377, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 1280.6042175292969, | |
| "epoch": 0.24685714285714286, | |
| "grad_norm": 0.31878677010536194, | |
| "kl": 0.010250091552734375, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.0004, | |
| "reward": 0.19101236946880817, | |
| "reward_std": 0.11184050468727946, | |
| "rewards/cosine_scaled_reward": 0.12221446633338928, | |
| "rewards/format_reward": 0.8750000111758709, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 1485.3541870117188, | |
| "epoch": 0.248, | |
| "grad_norm": 0.24682621657848358, | |
| "kl": 0.00894927978515625, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": 0.0004, | |
| "reward": 0.1820166790857911, | |
| "reward_std": 0.16682014428079128, | |
| "rewards/cosine_scaled_reward": 0.11264388589188457, | |
| "rewards/format_reward": 0.8333333395421505, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 1584.9583740234375, | |
| "epoch": 0.24914285714285714, | |
| "grad_norm": 0.28223466873168945, | |
| "kl": 0.010005950927734375, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.0004, | |
| "reward": 0.11426169364131056, | |
| "reward_std": 0.1477383803576231, | |
| "rewards/cosine_scaled_reward": -0.10390518826898187, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 1366.0625305175781, | |
| "epoch": 0.2502857142857143, | |
| "grad_norm": 0.3087660074234009, | |
| "kl": 0.0101165771484375, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.0004, | |
| "reward": 0.162561041302979, | |
| "reward_std": 0.15994944656267762, | |
| "rewards/cosine_scaled_reward": 0.03411710192449391, | |
| "rewards/format_reward": 0.8750000055879354, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 1414.1666717529297, | |
| "epoch": 0.25142857142857145, | |
| "grad_norm": 0.30027467012405396, | |
| "kl": 0.007183074951171875, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0003, | |
| "reward": 0.04776344425044954, | |
| "reward_std": 0.06931147351861, | |
| "rewards/cosine_scaled_reward": -0.2924546115100384, | |
| "rewards/format_reward": 0.8541666772216558, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 1363.2292022705078, | |
| "epoch": 0.25257142857142856, | |
| "grad_norm": 0.32602623105049133, | |
| "kl": 0.00916290283203125, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.0004, | |
| "reward": 0.17115566816937644, | |
| "reward_std": 0.10846680961549282, | |
| "rewards/cosine_scaled_reward": 0.0650689210742712, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 1461.8542022705078, | |
| "epoch": 0.2537142857142857, | |
| "grad_norm": 0.2890350818634033, | |
| "kl": 0.009174346923828125, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.0004, | |
| "reward": 0.1370222427067347, | |
| "reward_std": 0.13589494908228517, | |
| "rewards/cosine_scaled_reward": -0.022166259586811066, | |
| "rewards/format_reward": 0.8333333395421505, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 1474.7500381469727, | |
| "epoch": 0.25485714285714284, | |
| "grad_norm": 0.22688108682632446, | |
| "kl": 0.008312225341796875, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.0003, | |
| "reward": 0.1485723494552076, | |
| "reward_std": 0.10952452756464481, | |
| "rewards/cosine_scaled_reward": 0.029805000871419907, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 1974.1458892822266, | |
| "epoch": 0.256, | |
| "grad_norm": 0.24594879150390625, | |
| "kl": 0.008819580078125, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.0004, | |
| "reward": 0.11885308753699064, | |
| "reward_std": 0.1231303745880723, | |
| "rewards/cosine_scaled_reward": -0.07305323181208223, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 1713.7500305175781, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.2284773290157318, | |
| "kl": 0.014232635498046875, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0006, | |
| "reward": 0.10374407912604511, | |
| "reward_std": 0.14682644978165627, | |
| "rewards/cosine_scaled_reward": -0.10095677326899022, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 1503.7708740234375, | |
| "epoch": 0.2582857142857143, | |
| "grad_norm": 0.23106862604618073, | |
| "kl": 0.00830841064453125, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.0003, | |
| "reward": 0.18740362441167235, | |
| "reward_std": 0.08802895061671734, | |
| "rewards/cosine_scaled_reward": 0.10540169104933739, | |
| "rewards/format_reward": 0.8958333395421505, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 1244.1875534057617, | |
| "epoch": 0.25942857142857145, | |
| "grad_norm": 0.2707884907722473, | |
| "kl": 0.013683319091796875, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.0005, | |
| "reward": 0.15233041066676378, | |
| "reward_std": 0.11700486252084374, | |
| "rewards/cosine_scaled_reward": -0.04158254712820053, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 1151.6666870117188, | |
| "epoch": 0.26057142857142856, | |
| "grad_norm": 0.31404733657836914, | |
| "kl": 0.00904083251953125, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.0004, | |
| "reward": 0.1969418814405799, | |
| "reward_std": 0.11795947467908263, | |
| "rewards/cosine_scaled_reward": 0.13468213769374415, | |
| "rewards/format_reward": 0.8958333395421505, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 1607.1875610351562, | |
| "epoch": 0.26171428571428573, | |
| "grad_norm": 0.3030518889427185, | |
| "kl": 0.011165618896484375, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.0004, | |
| "reward": 0.11387707642279565, | |
| "reward_std": 0.0794507262762636, | |
| "rewards/cosine_scaled_reward": -0.06163225881755352, | |
| "rewards/format_reward": 0.7916666828095913, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 1865.7292022705078, | |
| "epoch": 0.26285714285714284, | |
| "grad_norm": 0.26728665828704834, | |
| "kl": 0.009771347045898438, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0004, | |
| "reward": 0.06328508502338082, | |
| "reward_std": 0.11280694883316755, | |
| "rewards/cosine_scaled_reward": -0.1895492672920227, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 1559.5625610351562, | |
| "epoch": 0.264, | |
| "grad_norm": 0.25874829292297363, | |
| "kl": 0.009349822998046875, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.0004, | |
| "reward": 0.22241820394992828, | |
| "reward_std": 0.11675288947299123, | |
| "rewards/cosine_scaled_reward": 0.19621967896819115, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 1832.2291870117188, | |
| "epoch": 0.2651428571428571, | |
| "grad_norm": 0.2535684406757355, | |
| "kl": 0.011371612548828125, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.0005, | |
| "reward": 0.04174727539066225, | |
| "reward_std": 0.08498770324513316, | |
| "rewards/cosine_scaled_reward": -0.2521397266536951, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 1139.8958587646484, | |
| "epoch": 0.2662857142857143, | |
| "grad_norm": 0.24799145758152008, | |
| "kl": 0.00756072998046875, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.0003, | |
| "reward": 0.10831367457285523, | |
| "reward_std": 0.10008358396589756, | |
| "rewards/cosine_scaled_reward": -0.16862575709819794, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 1544.5000305175781, | |
| "epoch": 0.2674285714285714, | |
| "grad_norm": 0.2703922986984253, | |
| "kl": 0.0122222900390625, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.0005, | |
| "reward": 0.13072135020047426, | |
| "reward_std": 0.12703408766537905, | |
| "rewards/cosine_scaled_reward": -0.010831212624907494, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 1198.8125267028809, | |
| "epoch": 0.26857142857142857, | |
| "grad_norm": 0.33897149562835693, | |
| "kl": 0.01023101806640625, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.0004, | |
| "reward": 0.22266705462243408, | |
| "reward_std": 0.13369846408022568, | |
| "rewards/cosine_scaled_reward": 0.17654232122004032, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 1820.2500457763672, | |
| "epoch": 0.26971428571428574, | |
| "grad_norm": 0.22755910456180573, | |
| "kl": 0.008855819702148438, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.0004, | |
| "reward": 0.10096835857257247, | |
| "reward_std": 0.12764764530584216, | |
| "rewards/cosine_scaled_reward": -0.10865432699210942, | |
| "rewards/format_reward": 0.8125000055879354, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 1567.0208587646484, | |
| "epoch": 0.27085714285714285, | |
| "grad_norm": 0.23869018256664276, | |
| "kl": 0.010875701904296875, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.0004, | |
| "reward": 0.1583275799639523, | |
| "reward_std": 0.07935558445751667, | |
| "rewards/cosine_scaled_reward": 0.06267662812024355, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 1344.5625305175781, | |
| "epoch": 0.272, | |
| "grad_norm": 0.23950256407260895, | |
| "kl": 0.01043701171875, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.0004, | |
| "reward": 0.2100483477115631, | |
| "reward_std": 0.10991842532530427, | |
| "rewards/cosine_scaled_reward": 0.15677512856200337, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 1426.041690826416, | |
| "epoch": 0.27314285714285713, | |
| "grad_norm": 0.2463754415512085, | |
| "kl": 0.0074310302734375, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.0003, | |
| "reward": 0.21384657500311732, | |
| "reward_std": 0.08286083268467337, | |
| "rewards/cosine_scaled_reward": 0.23679617792367935, | |
| "rewards/format_reward": 0.770833333954215, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 1488.8958587646484, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 0.4307219088077545, | |
| "kl": 0.0166473388671875, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.0007, | |
| "reward": 0.043208114642766304, | |
| "reward_std": 0.10311023378744721, | |
| "rewards/cosine_scaled_reward": -0.24894601851701736, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 1405.2916870117188, | |
| "epoch": 0.2754285714285714, | |
| "grad_norm": 0.42230215668678284, | |
| "kl": 0.013263702392578125, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.0005, | |
| "reward": 0.05663964038831182, | |
| "reward_std": 0.10991012584418058, | |
| "rewards/cosine_scaled_reward": -0.2536990698426962, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 1111.4375457763672, | |
| "epoch": 0.2765714285714286, | |
| "grad_norm": 0.365119606256485, | |
| "kl": 0.018003463745117188, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.0007, | |
| "reward": 0.12744820569059812, | |
| "reward_std": 0.1108874985948205, | |
| "rewards/cosine_scaled_reward": -0.08098506298847497, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 1557.8541870117188, | |
| "epoch": 0.2777142857142857, | |
| "grad_norm": 0.22312189638614655, | |
| "kl": 0.0106353759765625, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.0004, | |
| "reward": 0.1432976769283414, | |
| "reward_std": 0.11755505437031388, | |
| "rewards/cosine_scaled_reward": 0.002848621690645814, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 1817.9167022705078, | |
| "epoch": 0.27885714285714286, | |
| "grad_norm": 0.284006804227829, | |
| "kl": 0.01305389404296875, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.0005, | |
| "reward": 0.16071391198784113, | |
| "reward_std": 0.11166157713159919, | |
| "rewards/cosine_scaled_reward": 0.0663528572767973, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 1842.479232788086, | |
| "epoch": 0.28, | |
| "grad_norm": 0.26421064138412476, | |
| "kl": 0.01108551025390625, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.0004, | |
| "reward": 0.15495485439896584, | |
| "reward_std": 0.14859340619295835, | |
| "rewards/cosine_scaled_reward": 0.060377851128578186, | |
| "rewards/format_reward": 0.7916666772216558, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 1367.7292098999023, | |
| "epoch": 0.28114285714285714, | |
| "grad_norm": 0.257500022649765, | |
| "kl": 0.01171112060546875, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.0005, | |
| "reward": 0.1178839597851038, | |
| "reward_std": 0.11170466430485249, | |
| "rewards/cosine_scaled_reward": -0.12124981544911861, | |
| "rewards/format_reward": 0.9375, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 2025.4167175292969, | |
| "epoch": 0.2822857142857143, | |
| "grad_norm": 0.2877947986125946, | |
| "kl": 0.014371871948242188, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": 0.0006, | |
| "reward": 0.041426120849791914, | |
| "reward_std": 0.10642113536596298, | |
| "rewards/cosine_scaled_reward": -0.20261837355792522, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 1317.4167022705078, | |
| "epoch": 0.2834285714285714, | |
| "grad_norm": 0.3394499123096466, | |
| "kl": 0.012571334838867188, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.0005, | |
| "reward": 0.16258022841066122, | |
| "reward_std": 0.09610571060329676, | |
| "rewards/cosine_scaled_reward": 0.07010693056508899, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 1280.3125305175781, | |
| "epoch": 0.2845714285714286, | |
| "grad_norm": 0.27582064270973206, | |
| "kl": 0.011753082275390625, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.0005, | |
| "reward": 0.21597624802961946, | |
| "reward_std": 0.11335421586409211, | |
| "rewards/cosine_scaled_reward": 0.18638860061764717, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 1112.1875381469727, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.3718562424182892, | |
| "kl": 0.01335906982421875, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0005, | |
| "reward": 0.1376856635324657, | |
| "reward_std": 0.06630115583539009, | |
| "rewards/cosine_scaled_reward": -0.09413989027962089, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 969.0625152587891, | |
| "epoch": 0.28685714285714287, | |
| "grad_norm": 0.4498876631259918, | |
| "kl": 0.01354217529296875, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.0005, | |
| "reward": 0.13784294662764296, | |
| "reward_std": 0.11408048821613193, | |
| "rewards/cosine_scaled_reward": -0.029557042755186558, | |
| "rewards/format_reward": 0.8541666828095913, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 1556.2500534057617, | |
| "epoch": 0.288, | |
| "grad_norm": 0.35213375091552734, | |
| "kl": 0.01563262939453125, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.0006, | |
| "reward": 0.10068062460049987, | |
| "reward_std": 0.08584798872470856, | |
| "rewards/cosine_scaled_reward": -0.10943282302469015, | |
| "rewards/format_reward": 0.812500013038516, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 1594.7292175292969, | |
| "epoch": 0.28914285714285715, | |
| "grad_norm": 0.36279869079589844, | |
| "kl": 0.019073486328125, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": 0.0008, | |
| "reward": 0.10339808277785778, | |
| "reward_std": 0.14174744859337807, | |
| "rewards/cosine_scaled_reward": -0.09498709812760353, | |
| "rewards/format_reward": 0.7916666734963655, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 1617.0417022705078, | |
| "epoch": 0.29028571428571426, | |
| "grad_norm": 0.37625786662101746, | |
| "kl": 0.016139984130859375, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.0006, | |
| "reward": 0.06562280771322548, | |
| "reward_std": 0.09647063678130507, | |
| "rewards/cosine_scaled_reward": -0.20810320507735014, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 1775.500015258789, | |
| "epoch": 0.2914285714285714, | |
| "grad_norm": 0.30141302943229675, | |
| "kl": 0.012451171875, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0005, | |
| "reward": 0.03216266352683306, | |
| "reward_std": 0.09485536953434348, | |
| "rewards/cosine_scaled_reward": -0.2940612696111202, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 1419.5833740234375, | |
| "epoch": 0.2925714285714286, | |
| "grad_norm": 0.3471844494342804, | |
| "kl": 0.0146484375, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.0006, | |
| "reward": 0.15079816803336143, | |
| "reward_std": 0.10951651586219668, | |
| "rewards/cosine_scaled_reward": -0.006679622456431389, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 1932.9583892822266, | |
| "epoch": 0.2937142857142857, | |
| "grad_norm": 0.22311758995056152, | |
| "kl": 0.01297760009765625, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.0005, | |
| "reward": 0.20792145188897848, | |
| "reward_std": 0.12192836869508028, | |
| "rewards/cosine_scaled_reward": 0.19739503040909767, | |
| "rewards/format_reward": 0.8333333395421505, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 1696.458366394043, | |
| "epoch": 0.2948571428571429, | |
| "grad_norm": 0.3503570556640625, | |
| "kl": 0.012058258056640625, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.0005, | |
| "reward": 0.111656179651618, | |
| "reward_std": 0.14000094681978226, | |
| "rewards/cosine_scaled_reward": -0.06675757747143507, | |
| "rewards/format_reward": 0.7916666772216558, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 1482.2292022705078, | |
| "epoch": 0.296, | |
| "grad_norm": 0.38044440746307373, | |
| "kl": 0.016082763671875, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.0006, | |
| "reward": 0.14132701233029366, | |
| "reward_std": 0.09021098469384015, | |
| "rewards/cosine_scaled_reward": 0.016738089732825756, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 1308.8958930969238, | |
| "epoch": 0.29714285714285715, | |
| "grad_norm": 0.3778381943702698, | |
| "kl": 0.0111083984375, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0004, | |
| "reward": 0.20748187974095345, | |
| "reward_std": 0.1426714597037062, | |
| "rewards/cosine_scaled_reward": 0.14869027212262154, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 2185.395866394043, | |
| "epoch": 0.29828571428571427, | |
| "grad_norm": 0.29314514994621277, | |
| "kl": 0.019428253173828125, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.0008, | |
| "reward": 0.05902714841067791, | |
| "reward_std": 0.09425672655925155, | |
| "rewards/cosine_scaled_reward": -0.1403571031987667, | |
| "rewards/format_reward": 0.6250000111758709, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 1879.7917175292969, | |
| "epoch": 0.29942857142857143, | |
| "grad_norm": 0.46697157621383667, | |
| "kl": 0.020751953125, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.0008, | |
| "reward": 0.04076826642267406, | |
| "reward_std": 0.10802473686635494, | |
| "rewards/cosine_scaled_reward": -0.2183469645678997, | |
| "rewards/format_reward": 0.6666666828095913, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 1427.145881652832, | |
| "epoch": 0.30057142857142854, | |
| "grad_norm": 0.2928243577480316, | |
| "kl": 0.0118255615234375, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.0005, | |
| "reward": 0.05615242966450751, | |
| "reward_std": 0.06438650703057647, | |
| "rewards/cosine_scaled_reward": -0.2520607812330127, | |
| "rewards/format_reward": 0.8333333488553762, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 1357.7292251586914, | |
| "epoch": 0.3017142857142857, | |
| "grad_norm": 0.3987843096256256, | |
| "kl": 0.012592315673828125, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.0005, | |
| "reward": 0.11286137904971838, | |
| "reward_std": 0.13379723951220512, | |
| "rewards/cosine_scaled_reward": -0.10757828690111637, | |
| "rewards/format_reward": 0.8750000223517418, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 1381.5000381469727, | |
| "epoch": 0.3028571428571429, | |
| "grad_norm": 0.3415836989879608, | |
| "kl": 0.0178070068359375, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.0007, | |
| "reward": 0.1456056940369308, | |
| "reward_std": 0.14075131434947252, | |
| "rewards/cosine_scaled_reward": -0.009397267829626799, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 1806.1250610351562, | |
| "epoch": 0.304, | |
| "grad_norm": 0.38759687542915344, | |
| "kl": 0.01422119140625, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": 0.0006, | |
| "reward": 0.1108898997772485, | |
| "reward_std": 0.11440710537135601, | |
| "rewards/cosine_scaled_reward": -0.08643979020416737, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 2056.729202270508, | |
| "epoch": 0.30514285714285716, | |
| "grad_norm": 0.42212945222854614, | |
| "kl": 0.026676177978515625, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.0011, | |
| "reward": 0.022361958224792033, | |
| "reward_std": 0.08537128940224648, | |
| "rewards/cosine_scaled_reward": -0.24759646970778704, | |
| "rewards/format_reward": 0.6250000167638063, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 1231.9166870117188, | |
| "epoch": 0.3062857142857143, | |
| "grad_norm": 0.5157231092453003, | |
| "kl": 0.02033233642578125, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.0008, | |
| "reward": 0.0841333303033025, | |
| "reward_std": 0.10015194956213236, | |
| "rewards/cosine_scaled_reward": -0.18771476298570633, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 1421.8541870117188, | |
| "epoch": 0.30742857142857144, | |
| "grad_norm": 0.3175782859325409, | |
| "kl": 0.015338897705078125, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.0006, | |
| "reward": 0.0848088227212429, | |
| "reward_std": 0.10533672664314508, | |
| "rewards/cosine_scaled_reward": -0.1579919238574803, | |
| "rewards/format_reward": 0.8125, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 1852.9792251586914, | |
| "epoch": 0.30857142857142855, | |
| "grad_norm": 0.31283366680145264, | |
| "kl": 0.020725250244140625, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0008, | |
| "reward": 0.1707673908676952, | |
| "reward_std": 0.14836188638582826, | |
| "rewards/cosine_scaled_reward": 0.11553415982052684, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 1270.8333625793457, | |
| "epoch": 0.3097142857142857, | |
| "grad_norm": 0.4045168459415436, | |
| "kl": 0.020198822021484375, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.0008, | |
| "reward": 0.2056138776242733, | |
| "reward_std": 0.1318775275722146, | |
| "rewards/cosine_scaled_reward": 0.1646625578578096, | |
| "rewards/format_reward": 0.8750000111758709, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 1756.8542098999023, | |
| "epoch": 0.31085714285714283, | |
| "grad_norm": 0.34620898962020874, | |
| "kl": 0.02004241943359375, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.0008, | |
| "reward": 0.12157785186354886, | |
| "reward_std": 0.1287962575443089, | |
| "rewards/cosine_scaled_reward": -0.04085011733695865, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 1281.5000381469727, | |
| "epoch": 0.312, | |
| "grad_norm": 0.4477139115333557, | |
| "kl": 0.0211029052734375, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.0008, | |
| "reward": 0.1495568435639143, | |
| "reward_std": 0.1047550356015563, | |
| "rewards/cosine_scaled_reward": 0.03370687458664179, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 1047.6250457763672, | |
| "epoch": 0.31314285714285717, | |
| "grad_norm": 0.38608211278915405, | |
| "kl": 0.015705108642578125, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": 0.0006, | |
| "reward": 0.16697457217378542, | |
| "reward_std": 0.1347856274805963, | |
| "rewards/cosine_scaled_reward": 0.032155007666005986, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 1519.5625534057617, | |
| "epoch": 0.3142857142857143, | |
| "grad_norm": 0.41990184783935547, | |
| "kl": 0.02295684814453125, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0009, | |
| "reward": 0.19299401948228478, | |
| "reward_std": 0.1319624213501811, | |
| "rewards/cosine_scaled_reward": 0.15465315023902804, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 1206.583351135254, | |
| "epoch": 0.31542857142857145, | |
| "grad_norm": 0.38451164960861206, | |
| "kl": 0.020355224609375, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.0008, | |
| "reward": 0.14626443712040782, | |
| "reward_std": 0.13966891495510936, | |
| "rewards/cosine_scaled_reward": 0.004510253042099066, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 1248.2917022705078, | |
| "epoch": 0.31657142857142856, | |
| "grad_norm": 0.5081427097320557, | |
| "kl": 0.02585601806640625, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.001, | |
| "reward": 0.14069768914487213, | |
| "reward_std": 0.12813527416437864, | |
| "rewards/cosine_scaled_reward": -0.05632503447122872, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 1363.895866394043, | |
| "epoch": 0.3177142857142857, | |
| "grad_norm": 0.5567955374717712, | |
| "kl": 0.02330780029296875, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.0009, | |
| "reward": 0.1985365085711237, | |
| "reward_std": 0.08099143509753048, | |
| "rewards/cosine_scaled_reward": 0.12419202888850123, | |
| "rewards/format_reward": 0.875, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 1633.9583740234375, | |
| "epoch": 0.31885714285714284, | |
| "grad_norm": 0.42431047558784485, | |
| "kl": 0.035366058349609375, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.0014, | |
| "reward": 0.10419091582298279, | |
| "reward_std": 0.15187329379841685, | |
| "rewards/cosine_scaled_reward": -0.07383611425757408, | |
| "rewards/format_reward": 0.7500000093132257, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 1602.2292251586914, | |
| "epoch": 0.32, | |
| "grad_norm": 0.3582462668418884, | |
| "kl": 0.042247772216796875, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0017, | |
| "reward": 0.20170635590329766, | |
| "reward_std": 0.15067243855446577, | |
| "rewards/cosine_scaled_reward": 0.19395790994167328, | |
| "rewards/format_reward": 0.7708333395421505, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 2467.0416717529297, | |
| "epoch": 0.3211428571428571, | |
| "grad_norm": 0.29816770553588867, | |
| "kl": 0.05840301513671875, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.0023, | |
| "reward": 0.05792845832183957, | |
| "reward_std": 0.14697478245943785, | |
| "rewards/cosine_scaled_reward": -0.10388723947107792, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 1241.5416946411133, | |
| "epoch": 0.3222857142857143, | |
| "grad_norm": 0.46031466126441956, | |
| "kl": 0.0266265869140625, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.0011, | |
| "reward": 0.13118357677012682, | |
| "reward_std": 0.13791329925879836, | |
| "rewards/cosine_scaled_reward": -0.02069989638403058, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 1868.6250228881836, | |
| "epoch": 0.32342857142857145, | |
| "grad_norm": 0.32365959882736206, | |
| "kl": 0.052989959716796875, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.0021, | |
| "reward": 0.18105492927134037, | |
| "reward_std": 0.13572940416634083, | |
| "rewards/cosine_scaled_reward": 0.16773403156548738, | |
| "rewards/format_reward": 0.7291666828095913, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 1280.8542098999023, | |
| "epoch": 0.32457142857142857, | |
| "grad_norm": 0.4519507586956024, | |
| "kl": 0.019351959228515625, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.0008, | |
| "reward": 0.11721091519575566, | |
| "reward_std": 0.15023711137473583, | |
| "rewards/cosine_scaled_reward": -0.09081541141495109, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 1140.1250305175781, | |
| "epoch": 0.32571428571428573, | |
| "grad_norm": 0.7309654951095581, | |
| "kl": 0.0377349853515625, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.0015, | |
| "reward": 0.09533977252431214, | |
| "reward_std": 0.11678969115018845, | |
| "rewards/cosine_scaled_reward": -0.12558545544743538, | |
| "rewards/format_reward": 0.8125000186264515, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 1449.3958740234375, | |
| "epoch": 0.32685714285714285, | |
| "grad_norm": 0.49403443932533264, | |
| "kl": 0.0523681640625, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.0021, | |
| "reward": 0.1131226432044059, | |
| "reward_std": 0.12507515028119087, | |
| "rewards/cosine_scaled_reward": -0.08685349836014211, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 1521.6250457763672, | |
| "epoch": 0.328, | |
| "grad_norm": 0.858707070350647, | |
| "kl": 0.069488525390625, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 0.0028, | |
| "reward": 0.11133736907504499, | |
| "reward_std": 0.11151464702561498, | |
| "rewards/cosine_scaled_reward": -0.03888722602277994, | |
| "rewards/format_reward": 0.708333345130086, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 1589.6458892822266, | |
| "epoch": 0.3291428571428571, | |
| "grad_norm": 0.5665430426597595, | |
| "kl": 0.059162139892578125, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.0024, | |
| "reward": 0.11726528691360727, | |
| "reward_std": 0.11230942467227578, | |
| "rewards/cosine_scaled_reward": -0.06808646989520639, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 1320.7916831970215, | |
| "epoch": 0.3302857142857143, | |
| "grad_norm": 0.8541005849838257, | |
| "kl": 0.035400390625, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.0014, | |
| "reward": 0.12517158885020763, | |
| "reward_std": 0.07708289241418242, | |
| "rewards/cosine_scaled_reward": -0.030706046149134636, | |
| "rewards/format_reward": 0.7916666772216558, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 1165.520866394043, | |
| "epoch": 0.3314285714285714, | |
| "grad_norm": 0.45539847016334534, | |
| "kl": 0.050296783447265625, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.002, | |
| "reward": 0.1758332857862115, | |
| "reward_std": 0.14369960315525532, | |
| "rewards/cosine_scaled_reward": 0.06662235781550407, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 1888.7292556762695, | |
| "epoch": 0.3325714285714286, | |
| "grad_norm": 0.4208749234676361, | |
| "kl": 0.10372543334960938, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.0041, | |
| "reward": 0.15313149709254503, | |
| "reward_std": 0.14885053224861622, | |
| "rewards/cosine_scaled_reward": 0.030359832802787423, | |
| "rewards/format_reward": 0.8333333414047956, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 1906.958381652832, | |
| "epoch": 0.33371428571428574, | |
| "grad_norm": 1.015468716621399, | |
| "kl": 0.11415481567382812, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.0046, | |
| "reward": 0.08511171862483025, | |
| "reward_std": 0.11772025120444596, | |
| "rewards/cosine_scaled_reward": -0.146739911288023, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 1338.8541946411133, | |
| "epoch": 0.33485714285714285, | |
| "grad_norm": 0.5531266331672668, | |
| "kl": 0.0963287353515625, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": 0.0039, | |
| "reward": 0.14025127002969384, | |
| "reward_std": 0.12792299408465624, | |
| "rewards/cosine_scaled_reward": -0.01351697463542223, | |
| "rewards/format_reward": 0.8541666772216558, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 1772.7291870117188, | |
| "epoch": 0.336, | |
| "grad_norm": 0.9273716807365417, | |
| "kl": 0.05928802490234375, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.0024, | |
| "reward": 0.14557143417187035, | |
| "reward_std": 0.13822759315371513, | |
| "rewards/cosine_scaled_reward": 0.04577969899401069, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 1621.5833587646484, | |
| "epoch": 0.33714285714285713, | |
| "grad_norm": 0.7160328030586243, | |
| "kl": 0.06582260131835938, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0026, | |
| "reward": 0.15534777799621224, | |
| "reward_std": 0.12278164038434625, | |
| "rewards/cosine_scaled_reward": 0.053630582988262177, | |
| "rewards/format_reward": 0.7916666753590107, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 1988.916732788086, | |
| "epoch": 0.3382857142857143, | |
| "grad_norm": 1.5293620824813843, | |
| "kl": 0.130126953125, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.0052, | |
| "reward": 0.07497212127782404, | |
| "reward_std": 0.15273468242958188, | |
| "rewards/cosine_scaled_reward": -0.14556175749748945, | |
| "rewards/format_reward": 0.7291666902601719, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 2342.8959197998047, | |
| "epoch": 0.3394285714285714, | |
| "grad_norm": 1.5157654285430908, | |
| "kl": 0.10904693603515625, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.0044, | |
| "reward": 0.08573226723819971, | |
| "reward_std": 0.14976172288879752, | |
| "rewards/cosine_scaled_reward": -0.05238574789837003, | |
| "rewards/format_reward": 0.6041666753590107, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 1915.416732788086, | |
| "epoch": 0.3405714285714286, | |
| "grad_norm": 0.8163977861404419, | |
| "kl": 0.1429595947265625, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.0057, | |
| "reward": 0.10191577160730958, | |
| "reward_std": 0.10441284067928791, | |
| "rewards/cosine_scaled_reward": -0.054123382084071636, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 1942.7084045410156, | |
| "epoch": 0.3417142857142857, | |
| "grad_norm": 1.951476812362671, | |
| "kl": 0.17220306396484375, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.0069, | |
| "reward": 0.09106689637701493, | |
| "reward_std": 0.11453963397070765, | |
| "rewards/cosine_scaled_reward": -0.08032999746501446, | |
| "rewards/format_reward": 0.6875000186264515, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 2121.520881652832, | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 0.9551680684089661, | |
| "kl": 0.1997222900390625, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.008, | |
| "reward": 0.09341209102421999, | |
| "reward_std": 0.11039053509011865, | |
| "rewards/cosine_scaled_reward": -0.07323687896132469, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 2006.5209007263184, | |
| "epoch": 0.344, | |
| "grad_norm": 1.1690226793289185, | |
| "kl": 0.244354248046875, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.0098, | |
| "reward": 0.07932836120016873, | |
| "reward_std": 0.13190083391964436, | |
| "rewards/cosine_scaled_reward": -0.11513608321547508, | |
| "rewards/format_reward": 0.6875000093132257, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 1556.2917137145996, | |
| "epoch": 0.34514285714285714, | |
| "grad_norm": 1.1125872135162354, | |
| "kl": 0.10612106323242188, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": 0.0042, | |
| "reward": 0.15471726842224598, | |
| "reward_std": 0.10859864950180054, | |
| "rewards/cosine_scaled_reward": 0.06700704153627157, | |
| "rewards/format_reward": 0.7708333414047956, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 1470.5625267028809, | |
| "epoch": 0.3462857142857143, | |
| "grad_norm": 0.939318060874939, | |
| "kl": 0.0955810546875, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": 0.0038, | |
| "reward": 0.16825311817228794, | |
| "reward_std": 0.13590177986770868, | |
| "rewards/cosine_scaled_reward": 0.02981522586196661, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 1612.6458740234375, | |
| "epoch": 0.3474285714285714, | |
| "grad_norm": 1.2758152484893799, | |
| "kl": 0.13702392578125, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.0055, | |
| "reward": 0.08096132357604802, | |
| "reward_std": 0.10056991688907146, | |
| "rewards/cosine_scaled_reward": -0.1730497945100069, | |
| "rewards/format_reward": 0.812500013038516, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 1444.104232788086, | |
| "epoch": 0.3485714285714286, | |
| "grad_norm": 1.4721267223358154, | |
| "kl": 0.09059906005859375, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0036, | |
| "reward": 0.10656621214002371, | |
| "reward_std": 0.13656832091510296, | |
| "rewards/cosine_scaled_reward": -0.12406391743570566, | |
| "rewards/format_reward": 0.8750000223517418, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 1328.6458740234375, | |
| "epoch": 0.3497142857142857, | |
| "grad_norm": 1.2772217988967896, | |
| "kl": 0.14186859130859375, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.0057, | |
| "reward": 0.19892889651237056, | |
| "reward_std": 0.11784483585506678, | |
| "rewards/cosine_scaled_reward": 0.14353829622268677, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 1276.645866394043, | |
| "epoch": 0.35085714285714287, | |
| "grad_norm": 1.0841631889343262, | |
| "kl": 0.130828857421875, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": 0.0052, | |
| "reward": 0.12190989218652248, | |
| "reward_std": 0.12431731820106506, | |
| "rewards/cosine_scaled_reward": -0.08158679166808724, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 2021.0000305175781, | |
| "epoch": 0.352, | |
| "grad_norm": 1.4621251821517944, | |
| "kl": 0.2753448486328125, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.011, | |
| "reward": 0.09099714574404061, | |
| "reward_std": 0.14960693335160613, | |
| "rewards/cosine_scaled_reward": -0.09915194474160671, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 2146.7083740234375, | |
| "epoch": 0.35314285714285715, | |
| "grad_norm": 2.272526502609253, | |
| "kl": 0.4404144287109375, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.0176, | |
| "reward": 0.09269980387762189, | |
| "reward_std": 0.14243671763688326, | |
| "rewards/cosine_scaled_reward": -0.06128171645104885, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 1203.4791946411133, | |
| "epoch": 0.35428571428571426, | |
| "grad_norm": 1.225544810295105, | |
| "kl": 0.16626739501953125, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.0066, | |
| "reward": 0.10261391778476536, | |
| "reward_std": 0.08983314875513315, | |
| "rewards/cosine_scaled_reward": -0.16803579218685627, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 1146.8125343322754, | |
| "epoch": 0.3554285714285714, | |
| "grad_norm": 1.5572764873504639, | |
| "kl": 0.07305145263671875, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.0029, | |
| "reward": 0.19523340463638306, | |
| "reward_std": 0.14635545574128628, | |
| "rewards/cosine_scaled_reward": 0.08061425480991602, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 1333.0625228881836, | |
| "epoch": 0.3565714285714286, | |
| "grad_norm": 1.1539429426193237, | |
| "kl": 0.34932708740234375, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.014, | |
| "reward": 0.1985101569443941, | |
| "reward_std": 0.07966499030590057, | |
| "rewards/cosine_scaled_reward": 0.1561545841395855, | |
| "rewards/format_reward": 0.8541666697710752, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 2031.31254196167, | |
| "epoch": 0.3577142857142857, | |
| "grad_norm": 1.5145570039749146, | |
| "kl": 0.6246414184570312, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.0249, | |
| "reward": 0.11477423517499119, | |
| "reward_std": 0.11296425701584667, | |
| "rewards/cosine_scaled_reward": 0.012727040797472, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 1317.7083625793457, | |
| "epoch": 0.3588571428571429, | |
| "grad_norm": 2.5971078872680664, | |
| "kl": 0.4033660888671875, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.0162, | |
| "reward": 0.18170432932674885, | |
| "reward_std": 0.13025694666430354, | |
| "rewards/cosine_scaled_reward": 0.10399175062775612, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 2108.854248046875, | |
| "epoch": 0.36, | |
| "grad_norm": 2.6029951572418213, | |
| "kl": 0.965606689453125, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.0386, | |
| "reward": 0.09199583710869774, | |
| "reward_std": 0.1252267644740641, | |
| "rewards/cosine_scaled_reward": -0.07803997304290533, | |
| "rewards/format_reward": 0.6875000186264515, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 1711.2917366027832, | |
| "epoch": 0.36114285714285715, | |
| "grad_norm": 2.844820499420166, | |
| "kl": 0.6187896728515625, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.0247, | |
| "reward": 0.04600826557725668, | |
| "reward_std": 0.08856615889817476, | |
| "rewards/cosine_scaled_reward": -0.20388950034976006, | |
| "rewards/format_reward": 0.6666666846722364, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 1576.0625076293945, | |
| "epoch": 0.36228571428571427, | |
| "grad_norm": 2.0948660373687744, | |
| "kl": 0.4916229248046875, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.0197, | |
| "reward": 0.11145175900310278, | |
| "reward_std": 0.11253520660102367, | |
| "rewards/cosine_scaled_reward": -0.05809229984879494, | |
| "rewards/format_reward": 0.7500000186264515, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 1069.9166831970215, | |
| "epoch": 0.36342857142857143, | |
| "grad_norm": 2.052123785018921, | |
| "kl": 0.205841064453125, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.0082, | |
| "reward": 0.1357195656746626, | |
| "reward_std": 0.11260256776586175, | |
| "rewards/cosine_scaled_reward": -0.10857605282217264, | |
| "rewards/format_reward": 1.0, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 1185.145866394043, | |
| "epoch": 0.36457142857142855, | |
| "grad_norm": 3.6290059089660645, | |
| "kl": 0.23030853271484375, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.0092, | |
| "reward": 0.039664710406214, | |
| "reward_std": 0.09799355687573552, | |
| "rewards/cosine_scaled_reward": -0.29441852401942015, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 822.8125267028809, | |
| "epoch": 0.3657142857142857, | |
| "grad_norm": 10.01123046875, | |
| "kl": 0.42021942138671875, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0169, | |
| "reward": 0.14526783768087626, | |
| "reward_std": 0.1066361116245389, | |
| "rewards/cosine_scaled_reward": -0.05672251805663109, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 951.9166946411133, | |
| "epoch": 0.3668571428571429, | |
| "grad_norm": 1.7355072498321533, | |
| "kl": 0.166778564453125, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.0067, | |
| "reward": 0.16883038450032473, | |
| "reward_std": 0.10923763830214739, | |
| "rewards/cosine_scaled_reward": 0.03077949397265911, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 1182.9167137145996, | |
| "epoch": 0.368, | |
| "grad_norm": 3.371811628341675, | |
| "kl": 0.32440185546875, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.013, | |
| "reward": 0.10157357528805733, | |
| "reward_std": 0.09876813879236579, | |
| "rewards/cosine_scaled_reward": -0.07484898250550032, | |
| "rewards/format_reward": 0.7291666679084301, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 1447.1458587646484, | |
| "epoch": 0.36914285714285716, | |
| "grad_norm": 2.1031486988067627, | |
| "kl": 0.4143524169921875, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.0166, | |
| "reward": 0.0939664258621633, | |
| "reward_std": 0.11301013454794884, | |
| "rewards/cosine_scaled_reward": -0.08118974138051271, | |
| "rewards/format_reward": 0.7083333376795053, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 1413.3542251586914, | |
| "epoch": 0.3702857142857143, | |
| "grad_norm": 1.4308388233184814, | |
| "kl": 0.449249267578125, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.018, | |
| "reward": 0.054550049535464495, | |
| "reward_std": 0.1179595545399934, | |
| "rewards/cosine_scaled_reward": -0.20703712804242969, | |
| "rewards/format_reward": 0.7291666697710752, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 1824.2708702087402, | |
| "epoch": 0.37142857142857144, | |
| "grad_norm": 2.757559299468994, | |
| "kl": 1.072296142578125, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0429, | |
| "reward": 0.08934608940035105, | |
| "reward_std": 0.12084951438009739, | |
| "rewards/cosine_scaled_reward": -0.030909400433301926, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 1115.270866394043, | |
| "epoch": 0.37257142857142855, | |
| "grad_norm": 2.2729392051696777, | |
| "kl": 0.2136383056640625, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.0085, | |
| "reward": 0.16146898362785578, | |
| "reward_std": 0.14930668845772743, | |
| "rewards/cosine_scaled_reward": 0.023914188146591187, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 1361.3542022705078, | |
| "epoch": 0.3737142857142857, | |
| "grad_norm": 0.9111059904098511, | |
| "kl": 0.24196624755859375, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.0097, | |
| "reward": 0.18883195845410228, | |
| "reward_std": 0.12315699364989996, | |
| "rewards/cosine_scaled_reward": 0.10393881611526012, | |
| "rewards/format_reward": 0.895833333954215, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 1545.9375381469727, | |
| "epoch": 0.37485714285714283, | |
| "grad_norm": 2.3425772190093994, | |
| "kl": 0.56195068359375, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.0225, | |
| "reward": 0.0559651258517988, | |
| "reward_std": 0.11655561625957489, | |
| "rewards/cosine_scaled_reward": -0.21696932520717382, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 979.2708511352539, | |
| "epoch": 0.376, | |
| "grad_norm": 3.092437744140625, | |
| "kl": 0.1307373046875, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.0052, | |
| "reward": 0.16340086178388447, | |
| "reward_std": 0.10502722533419728, | |
| "rewards/cosine_scaled_reward": 0.060619720607064664, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 1155.9375305175781, | |
| "epoch": 0.37714285714285717, | |
| "grad_norm": 2.5028579235076904, | |
| "kl": 0.5152435302734375, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0206, | |
| "reward": 0.0951260298024863, | |
| "reward_std": 0.10306549491360784, | |
| "rewards/cosine_scaled_reward": -0.17680510133504868, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 1770.0208892822266, | |
| "epoch": 0.3782857142857143, | |
| "grad_norm": 1.7597198486328125, | |
| "kl": 0.6001739501953125, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.024, | |
| "reward": 0.10013224184513092, | |
| "reward_std": 0.11445806687697768, | |
| "rewards/cosine_scaled_reward": -0.0341799296438694, | |
| "rewards/format_reward": 0.6458333395421505, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 1468.270881652832, | |
| "epoch": 0.37942857142857145, | |
| "grad_norm": 2.4370524883270264, | |
| "kl": 0.4661865234375, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.0187, | |
| "reward": 0.0839388279709965, | |
| "reward_std": 0.08322880789637566, | |
| "rewards/cosine_scaled_reward": -0.1475684866309166, | |
| "rewards/format_reward": 0.7708333544433117, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 1169.4166946411133, | |
| "epoch": 0.38057142857142856, | |
| "grad_norm": 1.17229163646698, | |
| "kl": 0.29107666015625, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.0117, | |
| "reward": 0.12938493536785245, | |
| "reward_std": 0.13117663795128465, | |
| "rewards/cosine_scaled_reward": -0.09842105722054839, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 2001.9375305175781, | |
| "epoch": 0.38171428571428573, | |
| "grad_norm": 2.2998111248016357, | |
| "kl": 1.0654296875, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.0426, | |
| "reward": 0.03344674052641494, | |
| "reward_std": 0.12521635321900249, | |
| "rewards/cosine_scaled_reward": -0.2376094851642847, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 1584.0625534057617, | |
| "epoch": 0.38285714285714284, | |
| "grad_norm": 2.906867504119873, | |
| "kl": 0.5097579956054688, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.0204, | |
| "reward": 0.11645684402901679, | |
| "reward_std": 0.12058440665714443, | |
| "rewards/cosine_scaled_reward": -0.04469235986471176, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 1674.354206085205, | |
| "epoch": 0.384, | |
| "grad_norm": 1.6613434553146362, | |
| "kl": 0.6746101379394531, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.027, | |
| "reward": 0.10816401499323547, | |
| "reward_std": 0.13928860798478127, | |
| "rewards/cosine_scaled_reward": 0.030387197621166706, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 1494.8334045410156, | |
| "epoch": 0.3851428571428571, | |
| "grad_norm": 2.075910806655884, | |
| "kl": 0.5845260620117188, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.0234, | |
| "reward": 0.09280223221867345, | |
| "reward_std": 0.1204152749851346, | |
| "rewards/cosine_scaled_reward": -0.17956852912902832, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 1031.0625076293945, | |
| "epoch": 0.3862857142857143, | |
| "grad_norm": 1.3386896848678589, | |
| "kl": 0.3839874267578125, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.0154, | |
| "reward": 0.12651887256652117, | |
| "reward_std": 0.11517677642405033, | |
| "rewards/cosine_scaled_reward": -0.06461506709456444, | |
| "rewards/format_reward": 0.8541666772216558, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 1663.1250305175781, | |
| "epoch": 0.38742857142857146, | |
| "grad_norm": 2.8500008583068848, | |
| "kl": 0.616424560546875, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.0247, | |
| "reward": 0.09466042937128805, | |
| "reward_std": 0.09753736667335033, | |
| "rewards/cosine_scaled_reward": -0.1472719321027398, | |
| "rewards/format_reward": 0.8333333544433117, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 1387.4375381469727, | |
| "epoch": 0.38857142857142857, | |
| "grad_norm": 2.0746471881866455, | |
| "kl": 0.4688873291015625, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0188, | |
| "reward": 0.1363295007031411, | |
| "reward_std": 0.12295715417712927, | |
| "rewards/cosine_scaled_reward": -0.040779574774205685, | |
| "rewards/format_reward": 0.875, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 1612.5208740234375, | |
| "epoch": 0.38971428571428574, | |
| "grad_norm": 2.2794735431671143, | |
| "kl": 0.814697265625, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.0326, | |
| "reward": 0.1513365504797548, | |
| "reward_std": 0.15308804996311665, | |
| "rewards/cosine_scaled_reward": 0.11684986762702465, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 1542.833381652832, | |
| "epoch": 0.39085714285714285, | |
| "grad_norm": 2.162214756011963, | |
| "kl": 0.7668609619140625, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.0307, | |
| "reward": 0.07486888614948839, | |
| "reward_std": 0.08258337597362697, | |
| "rewards/cosine_scaled_reward": -0.19311499642208219, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 1584.8750457763672, | |
| "epoch": 0.392, | |
| "grad_norm": 2.241345167160034, | |
| "kl": 0.691986083984375, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.0277, | |
| "reward": 0.13888886122731492, | |
| "reward_std": 0.1267297170124948, | |
| "rewards/cosine_scaled_reward": 0.026948151644319296, | |
| "rewards/format_reward": 0.7500000186264515, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 1414.5833587646484, | |
| "epoch": 0.3931428571428571, | |
| "grad_norm": 4.952951908111572, | |
| "kl": 0.78118896484375, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.0312, | |
| "reward": 0.1714508015429601, | |
| "reward_std": 0.1391323572024703, | |
| "rewards/cosine_scaled_reward": 0.07935434021055698, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 1180.333366394043, | |
| "epoch": 0.3942857142857143, | |
| "grad_norm": 1.540398120880127, | |
| "kl": 0.5207366943359375, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.0209, | |
| "reward": 0.13360386714339256, | |
| "reward_std": 0.14522417169064283, | |
| "rewards/cosine_scaled_reward": -0.034749194979667664, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 1519.6875305175781, | |
| "epoch": 0.3954285714285714, | |
| "grad_norm": 2.235276699066162, | |
| "kl": 0.7563095092773438, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.0302, | |
| "reward": 0.06512028211727738, | |
| "reward_std": 0.11110542109236121, | |
| "rewards/cosine_scaled_reward": -0.17456839326769114, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 1307.8750305175781, | |
| "epoch": 0.3965714285714286, | |
| "grad_norm": 4.259415626525879, | |
| "kl": 0.570068359375, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.0228, | |
| "reward": 0.08080136496573687, | |
| "reward_std": 0.10251538408920169, | |
| "rewards/cosine_scaled_reward": -0.22406550850064377, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 1316.1250381469727, | |
| "epoch": 0.3977142857142857, | |
| "grad_norm": 1.8449828624725342, | |
| "kl": 0.8987274169921875, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.0359, | |
| "reward": 0.11601153435185552, | |
| "reward_std": 0.12534239329397678, | |
| "rewards/cosine_scaled_reward": -0.09255390800535679, | |
| "rewards/format_reward": 0.8541666939854622, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 1266.5000305175781, | |
| "epoch": 0.39885714285714285, | |
| "grad_norm": 2.301177740097046, | |
| "kl": 0.484771728515625, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.0194, | |
| "reward": 0.11150891752913594, | |
| "reward_std": 0.10962018929421902, | |
| "rewards/cosine_scaled_reward": -0.07765534892678261, | |
| "rewards/format_reward": 0.8125000037252903, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 1148.3125343322754, | |
| "epoch": 0.4, | |
| "grad_norm": 2.0247983932495117, | |
| "kl": 0.4848480224609375, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.0194, | |
| "reward": 0.15430114115588367, | |
| "reward_std": 0.14347478654235601, | |
| "rewards/cosine_scaled_reward": 0.002343377098441124, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 1485.7500457763672, | |
| "epoch": 0.40114285714285713, | |
| "grad_norm": 2.1637814044952393, | |
| "kl": 0.888519287109375, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.0356, | |
| "reward": 0.12535253415990155, | |
| "reward_std": 0.15357061475515366, | |
| "rewards/cosine_scaled_reward": -0.002679265569895506, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 1839.8125534057617, | |
| "epoch": 0.4022857142857143, | |
| "grad_norm": 3.5423998832702637, | |
| "kl": 1.449371337890625, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.058, | |
| "reward": 0.09909449616679922, | |
| "reward_std": 0.15971363987773657, | |
| "rewards/cosine_scaled_reward": -0.0335394795256434, | |
| "rewards/format_reward": 0.6458333507180214, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 1224.6875305175781, | |
| "epoch": 0.4034285714285714, | |
| "grad_norm": 3.0395312309265137, | |
| "kl": 0.6749420166015625, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.027, | |
| "reward": 0.2176575637422502, | |
| "reward_std": 0.1126766725210473, | |
| "rewards/cosine_scaled_reward": 0.20828061178326607, | |
| "rewards/format_reward": 0.8541666753590107, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 1273.708381652832, | |
| "epoch": 0.4045714285714286, | |
| "grad_norm": 2.6028831005096436, | |
| "kl": 0.7369384765625, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.0296, | |
| "reward": 0.12516162917017937, | |
| "reward_std": 0.09799297200515866, | |
| "rewards/cosine_scaled_reward": -0.0536028565838933, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 1033.4166946411133, | |
| "epoch": 0.4057142857142857, | |
| "grad_norm": 8.765235900878906, | |
| "kl": 0.76263427734375, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0305, | |
| "reward": 0.2028479753062129, | |
| "reward_std": 0.13977694138884544, | |
| "rewards/cosine_scaled_reward": 0.11576181277632713, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 1755.3959197998047, | |
| "epoch": 0.40685714285714286, | |
| "grad_norm": 8.445033073425293, | |
| "kl": 1.7131805419921875, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.0685, | |
| "reward": 0.06232760299462825, | |
| "reward_std": 0.15073465276509523, | |
| "rewards/cosine_scaled_reward": -0.1625305749475956, | |
| "rewards/format_reward": 0.6875000223517418, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 1907.8750686645508, | |
| "epoch": 0.408, | |
| "grad_norm": 2.7435877323150635, | |
| "kl": 1.49609375, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.0599, | |
| "reward": 0.045256637153215706, | |
| "reward_std": 0.10663844272494316, | |
| "rewards/cosine_scaled_reward": -0.23459493229165673, | |
| "rewards/format_reward": 0.7291666846722364, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 1480.333366394043, | |
| "epoch": 0.40914285714285714, | |
| "grad_norm": 2.5645945072174072, | |
| "kl": 0.68377685546875, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.0273, | |
| "reward": 0.1269954121671617, | |
| "reward_std": 0.12238077353686094, | |
| "rewards/cosine_scaled_reward": -0.08821228123269975, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 1268.8750381469727, | |
| "epoch": 0.4102857142857143, | |
| "grad_norm": 9.816429138183594, | |
| "kl": 1.206329345703125, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.0483, | |
| "reward": 0.0941942217759788, | |
| "reward_std": 0.09892157511785626, | |
| "rewards/cosine_scaled_reward": -0.17397763207554817, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 1494.9583587646484, | |
| "epoch": 0.4114285714285714, | |
| "grad_norm": 2.043757915496826, | |
| "kl": 0.8025665283203125, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0321, | |
| "reward": 0.14919199608266354, | |
| "reward_std": 0.16920089721679688, | |
| "rewards/cosine_scaled_reward": 0.01944921351969242, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 1387.5000457763672, | |
| "epoch": 0.4125714285714286, | |
| "grad_norm": 3.627060890197754, | |
| "kl": 0.604827880859375, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.0243, | |
| "reward": 0.0996842200984247, | |
| "reward_std": 0.12515971716493368, | |
| "rewards/cosine_scaled_reward": -0.14535773918032646, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 1282.7291946411133, | |
| "epoch": 0.4137142857142857, | |
| "grad_norm": 2.7785682678222656, | |
| "kl": 0.9784774780273438, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.0392, | |
| "reward": 0.15694484941195697, | |
| "reward_std": 0.08703827066347003, | |
| "rewards/cosine_scaled_reward": 0.005612561479210854, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 981.1875267028809, | |
| "epoch": 0.41485714285714287, | |
| "grad_norm": 2.8391177654266357, | |
| "kl": 0.4031524658203125, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.0162, | |
| "reward": 0.20889083773363382, | |
| "reward_std": 0.09412059234455228, | |
| "rewards/cosine_scaled_reward": 0.14475639257580042, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 1328.2500457763672, | |
| "epoch": 0.416, | |
| "grad_norm": 1.4150444269180298, | |
| "kl": 0.5859603881835938, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.0235, | |
| "reward": 0.05011219787411392, | |
| "reward_std": 0.09909166162833571, | |
| "rewards/cosine_scaled_reward": -0.26201771944761276, | |
| "rewards/format_reward": 0.8125000260770321, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 1732.4375457763672, | |
| "epoch": 0.41714285714285715, | |
| "grad_norm": 3.007594585418701, | |
| "kl": 1.2857818603515625, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.0514, | |
| "reward": 0.13636061176657677, | |
| "reward_std": 0.1195504111237824, | |
| "rewards/cosine_scaled_reward": 0.015712120453827083, | |
| "rewards/format_reward": 0.750000013038516, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 1108.0000495910645, | |
| "epoch": 0.41828571428571426, | |
| "grad_norm": 4.287596702575684, | |
| "kl": 0.290069580078125, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.0116, | |
| "reward": 0.15384384151548147, | |
| "reward_std": 0.1040445901453495, | |
| "rewards/cosine_scaled_reward": 0.009631453547626734, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 1712.8542022705078, | |
| "epoch": 0.41942857142857143, | |
| "grad_norm": 2.6099181175231934, | |
| "kl": 0.9123382568359375, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.0365, | |
| "reward": 0.0756345079280436, | |
| "reward_std": 0.11609864910133183, | |
| "rewards/cosine_scaled_reward": -0.12630227487534285, | |
| "rewards/format_reward": 0.6875000111758709, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 1932.4375381469727, | |
| "epoch": 0.4205714285714286, | |
| "grad_norm": 4.4302215576171875, | |
| "kl": 1.157470703125, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": 0.0463, | |
| "reward": 0.05102244240697473, | |
| "reward_std": 0.14196251472458243, | |
| "rewards/cosine_scaled_reward": -0.1656601596623659, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 1801.0000686645508, | |
| "epoch": 0.4217142857142857, | |
| "grad_norm": 2.231231451034546, | |
| "kl": 1.43145751953125, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.0572, | |
| "reward": 0.13401565965614282, | |
| "reward_std": 0.1620243601500988, | |
| "rewards/cosine_scaled_reward": 0.007201282307505608, | |
| "rewards/format_reward": 0.7500000204890966, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 1558.6250267028809, | |
| "epoch": 0.4228571428571429, | |
| "grad_norm": 2.5228145122528076, | |
| "kl": 1.020751953125, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0408, | |
| "reward": 0.09310736267070752, | |
| "reward_std": 0.09267222543712705, | |
| "rewards/cosine_scaled_reward": -0.09523181803524494, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 933.7292060852051, | |
| "epoch": 0.424, | |
| "grad_norm": 1.8327668905258179, | |
| "kl": 0.6253280639648438, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.025, | |
| "reward": 0.1391057469882071, | |
| "reward_std": 0.09823393146507442, | |
| "rewards/cosine_scaled_reward": 0.023010117933154106, | |
| "rewards/format_reward": 0.7708333469927311, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 1648.6458740234375, | |
| "epoch": 0.42514285714285716, | |
| "grad_norm": 3.7653868198394775, | |
| "kl": 0.7486572265625, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.0299, | |
| "reward": 0.11916764298803173, | |
| "reward_std": 0.16286169085651636, | |
| "rewards/cosine_scaled_reward": -0.006715672556310892, | |
| "rewards/format_reward": 0.7083333544433117, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 899.7083587646484, | |
| "epoch": 0.42628571428571427, | |
| "grad_norm": 2.0490570068359375, | |
| "kl": 0.393463134765625, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.0157, | |
| "reward": 0.11241465236525983, | |
| "reward_std": 0.1083641320001334, | |
| "rewards/cosine_scaled_reward": -0.0999544644728303, | |
| "rewards/format_reward": 0.8541666828095913, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 1088.1042022705078, | |
| "epoch": 0.42742857142857144, | |
| "grad_norm": 2.575913667678833, | |
| "kl": 0.3724212646484375, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.0149, | |
| "reward": 0.1537247821688652, | |
| "reward_std": 0.144410849083215, | |
| "rewards/cosine_scaled_reward": 0.0019068364053964615, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 1255.9375457763672, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 4.075099468231201, | |
| "kl": 1.0, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.04, | |
| "reward": 0.11958665121346712, | |
| "reward_std": 0.17436719313263893, | |
| "rewards/cosine_scaled_reward": -0.02865603007376194, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 1406.4583740234375, | |
| "epoch": 0.4297142857142857, | |
| "grad_norm": 4.192153453826904, | |
| "kl": 1.0216064453125, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.0409, | |
| "reward": 0.11533228470943868, | |
| "reward_std": 0.1220639725215733, | |
| "rewards/cosine_scaled_reward": -0.10069759003818035, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 1517.8542175292969, | |
| "epoch": 0.4308571428571429, | |
| "grad_norm": 3.5561301708221436, | |
| "kl": 0.7474365234375, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.0299, | |
| "reward": 0.06857469491660595, | |
| "reward_std": 0.12185074761509895, | |
| "rewards/cosine_scaled_reward": -0.20741149224340916, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 377 | |
| }, | |
| { | |
| "completion_length": 1371.6458797454834, | |
| "epoch": 0.432, | |
| "grad_norm": 2.8652353286743164, | |
| "kl": 0.5273590087890625, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.0211, | |
| "reward": 0.15830194216687232, | |
| "reward_std": 0.12496394384652376, | |
| "rewards/cosine_scaled_reward": 0.02524256706237793, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 1719.6666946411133, | |
| "epoch": 0.43314285714285716, | |
| "grad_norm": 1.9505208730697632, | |
| "kl": 0.86883544921875, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.0347, | |
| "reward": 0.07183210924267769, | |
| "reward_std": 0.08735731011256576, | |
| "rewards/cosine_scaled_reward": -0.19765841774642467, | |
| "rewards/format_reward": 0.812500013038516, | |
| "step": 379 | |
| }, | |
| { | |
| "completion_length": 1268.458366394043, | |
| "epoch": 0.4342857142857143, | |
| "grad_norm": 2.074549436569214, | |
| "kl": 0.7683868408203125, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.0308, | |
| "reward": 0.09168538171797991, | |
| "reward_std": 0.1286314483731985, | |
| "rewards/cosine_scaled_reward": -0.10583510436117649, | |
| "rewards/format_reward": 0.7500000055879354, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 1851.1042251586914, | |
| "epoch": 0.43542857142857144, | |
| "grad_norm": 3.475543975830078, | |
| "kl": 1.308349609375, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.0523, | |
| "reward": 0.017921562888659537, | |
| "reward_std": 0.10069577465765178, | |
| "rewards/cosine_scaled_reward": -0.26895514875650406, | |
| "rewards/format_reward": 0.6458333525806665, | |
| "step": 381 | |
| }, | |
| { | |
| "completion_length": 1056.208351135254, | |
| "epoch": 0.43657142857142855, | |
| "grad_norm": 3.336723566055298, | |
| "kl": 0.65924072265625, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.0264, | |
| "reward": 0.0698289682622999, | |
| "reward_std": 0.10450809169560671, | |
| "rewards/cosine_scaled_reward": -0.20110327936708927, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 1278.520881652832, | |
| "epoch": 0.4377142857142857, | |
| "grad_norm": 1.7827008962631226, | |
| "kl": 1.0176849365234375, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.0407, | |
| "reward": 0.15380790340714157, | |
| "reward_std": 0.1372139612212777, | |
| "rewards/cosine_scaled_reward": 0.03767992998473346, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 383 | |
| }, | |
| { | |
| "completion_length": 1171.3541946411133, | |
| "epoch": 0.43885714285714283, | |
| "grad_norm": 2.2583577632904053, | |
| "kl": 0.81756591796875, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.0327, | |
| "reward": 0.21438380563631654, | |
| "reward_std": 0.18565166369080544, | |
| "rewards/cosine_scaled_reward": 0.19361148471944034, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 1468.4584121704102, | |
| "epoch": 0.44, | |
| "grad_norm": 3.9643666744232178, | |
| "kl": 0.9686126708984375, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0388, | |
| "reward": 0.09368883771821856, | |
| "reward_std": 0.12416216172277927, | |
| "rewards/cosine_scaled_reward": -0.13376147765666246, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 1320.7292022705078, | |
| "epoch": 0.44114285714285717, | |
| "grad_norm": 2.8084425926208496, | |
| "kl": 0.906951904296875, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.0363, | |
| "reward": 0.18752056313678622, | |
| "reward_std": 0.09679444809444249, | |
| "rewards/cosine_scaled_reward": 0.0695192702114582, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 1446.6667175292969, | |
| "epoch": 0.4422857142857143, | |
| "grad_norm": 2.2158679962158203, | |
| "kl": 0.94366455078125, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.0378, | |
| "reward": 0.13372905366122723, | |
| "reward_std": 0.09293439192697406, | |
| "rewards/cosine_scaled_reward": -0.017829248681664467, | |
| "rewards/format_reward": 0.8125000204890966, | |
| "step": 387 | |
| }, | |
| { | |
| "completion_length": 1416.458366394043, | |
| "epoch": 0.44342857142857145, | |
| "grad_norm": 3.4314982891082764, | |
| "kl": 1.20458984375, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.0482, | |
| "reward": 0.10979634639807045, | |
| "reward_std": 0.14348772866651416, | |
| "rewards/cosine_scaled_reward": -0.09102598764002323, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 1301.3750381469727, | |
| "epoch": 0.44457142857142856, | |
| "grad_norm": 2.1517300605773926, | |
| "kl": 0.8640289306640625, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.0345, | |
| "reward": 0.12174346391111612, | |
| "reward_std": 0.1113532236777246, | |
| "rewards/cosine_scaled_reward": -0.11768272100016475, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 389 | |
| }, | |
| { | |
| "completion_length": 1864.270881652832, | |
| "epoch": 0.44571428571428573, | |
| "grad_norm": 2.703826665878296, | |
| "kl": 1.3235321044921875, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.053, | |
| "reward": 0.08123930124565959, | |
| "reward_std": 0.12660485692322254, | |
| "rewards/cosine_scaled_reward": -0.06472943164408207, | |
| "rewards/format_reward": 0.6041666753590107, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 1732.4167137145996, | |
| "epoch": 0.44685714285714284, | |
| "grad_norm": 3.1918063163757324, | |
| "kl": 1.76708984375, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.0708, | |
| "reward": 0.14286700636148453, | |
| "reward_std": 0.18649776838719845, | |
| "rewards/cosine_scaled_reward": 0.028382533695548773, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 391 | |
| }, | |
| { | |
| "completion_length": 1371.7916946411133, | |
| "epoch": 0.448, | |
| "grad_norm": 3.6219229698181152, | |
| "kl": 0.9765167236328125, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.039, | |
| "reward": 0.06799169280566275, | |
| "reward_std": 0.09887422667816281, | |
| "rewards/cosine_scaled_reward": -0.19014379568398, | |
| "rewards/format_reward": 0.7708333488553762, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 1691.1666870117188, | |
| "epoch": 0.4491428571428571, | |
| "grad_norm": 2.618884801864624, | |
| "kl": 1.6240234375, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.0649, | |
| "reward": 0.06132141873240471, | |
| "reward_std": 0.10926410043612123, | |
| "rewards/cosine_scaled_reward": -0.12576089892536402, | |
| "rewards/format_reward": 0.6041666809469461, | |
| "step": 393 | |
| }, | |
| { | |
| "completion_length": 1537.270866394043, | |
| "epoch": 0.4502857142857143, | |
| "grad_norm": 6.063929557800293, | |
| "kl": 1.8046112060546875, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.0722, | |
| "reward": 0.0685962769202888, | |
| "reward_std": 0.09430173807777464, | |
| "rewards/cosine_scaled_reward": -0.1982439812272787, | |
| "rewards/format_reward": 0.7916666772216558, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 1333.958396911621, | |
| "epoch": 0.4514285714285714, | |
| "grad_norm": 4.921578884124756, | |
| "kl": 1.4940185546875, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.0598, | |
| "reward": 0.1565821710973978, | |
| "reward_std": 0.11654932564124465, | |
| "rewards/cosine_scaled_reward": 0.03457294497638941, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 1335.645866394043, | |
| "epoch": 0.45257142857142857, | |
| "grad_norm": 2.5358009338378906, | |
| "kl": 0.828582763671875, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.0332, | |
| "reward": 0.143600944429636, | |
| "reward_std": 0.12354162661358714, | |
| "rewards/cosine_scaled_reward": -0.04586084187030792, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 1311.5625381469727, | |
| "epoch": 0.45371428571428574, | |
| "grad_norm": 3.427694082260132, | |
| "kl": 0.9869384765625, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.0395, | |
| "reward": 0.08522352996078553, | |
| "reward_std": 0.11573155457153916, | |
| "rewards/cosine_scaled_reward": -0.11667108163237572, | |
| "rewards/format_reward": 0.7291666883975267, | |
| "step": 397 | |
| }, | |
| { | |
| "completion_length": 1535.7500457763672, | |
| "epoch": 0.45485714285714285, | |
| "grad_norm": 1.9625768661499023, | |
| "kl": 1.721282958984375, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.0688, | |
| "reward": 0.10138712753541768, | |
| "reward_std": 0.12253602081909776, | |
| "rewards/cosine_scaled_reward": -0.0959820756688714, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 1052.1875305175781, | |
| "epoch": 0.456, | |
| "grad_norm": 2.016707181930542, | |
| "kl": 0.39638519287109375, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.0159, | |
| "reward": 0.20977385994046926, | |
| "reward_std": 0.17129726987332106, | |
| "rewards/cosine_scaled_reward": 0.13206836581230164, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 399 | |
| }, | |
| { | |
| "completion_length": 998.1458587646484, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 1.5737963914871216, | |
| "kl": 0.47527313232421875, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.019, | |
| "reward": 0.20247652614489198, | |
| "reward_std": 0.11905267764814198, | |
| "rewards/cosine_scaled_reward": 0.13497311808168888, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 1659.1250305175781, | |
| "epoch": 0.4582857142857143, | |
| "grad_norm": 2.9050798416137695, | |
| "kl": 1.2333984375, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.0493, | |
| "reward": 0.13014382123947144, | |
| "reward_std": 0.12995108915492892, | |
| "rewards/cosine_scaled_reward": -0.019444716162979603, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 401 | |
| }, | |
| { | |
| "completion_length": 1586.0417175292969, | |
| "epoch": 0.4594285714285714, | |
| "grad_norm": 2.764214277267456, | |
| "kl": 1.1788330078125, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.0472, | |
| "reward": 0.11509460117667913, | |
| "reward_std": 0.12621240504086018, | |
| "rewards/cosine_scaled_reward": -0.03602955490350723, | |
| "rewards/format_reward": 0.7291666939854622, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 1443.87504196167, | |
| "epoch": 0.4605714285714286, | |
| "grad_norm": 2.6378586292266846, | |
| "kl": 1.0952301025390625, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.0438, | |
| "reward": 0.14914248324930668, | |
| "reward_std": 0.08150437835138291, | |
| "rewards/cosine_scaled_reward": 0.02718578651547432, | |
| "rewards/format_reward": 0.8125000204890966, | |
| "step": 403 | |
| }, | |
| { | |
| "completion_length": 1235.4792098999023, | |
| "epoch": 0.4617142857142857, | |
| "grad_norm": 2.0312211513519287, | |
| "kl": 0.6242218017578125, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.025, | |
| "reward": 0.15180272003635764, | |
| "reward_std": 0.10177731700241566, | |
| "rewards/cosine_scaled_reward": -0.007191255688667297, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 1179.3958587646484, | |
| "epoch": 0.46285714285714286, | |
| "grad_norm": 2.201613664627075, | |
| "kl": 0.848846435546875, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0339, | |
| "reward": 0.21597139816731215, | |
| "reward_std": 0.16427310602739453, | |
| "rewards/cosine_scaled_reward": 0.154250493273139, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 1454.9166946411133, | |
| "epoch": 0.464, | |
| "grad_norm": 3.2176761627197266, | |
| "kl": 1.196868896484375, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.0477, | |
| "reward": 0.12235562037676573, | |
| "reward_std": 0.12256110971793532, | |
| "rewards/cosine_scaled_reward": -0.045554774114862084, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 1515.4792289733887, | |
| "epoch": 0.46514285714285714, | |
| "grad_norm": 3.040894031524658, | |
| "kl": 0.8402099609375, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.0336, | |
| "reward": 0.15898172045126557, | |
| "reward_std": 0.10987926088273525, | |
| "rewards/cosine_scaled_reward": 0.03327514789998531, | |
| "rewards/format_reward": 0.8541666939854622, | |
| "step": 407 | |
| }, | |
| { | |
| "completion_length": 1372.5625534057617, | |
| "epoch": 0.4662857142857143, | |
| "grad_norm": 1.9494367837905884, | |
| "kl": 0.5256500244140625, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.021, | |
| "reward": 0.18118306156247854, | |
| "reward_std": 0.1542639322578907, | |
| "rewards/cosine_scaled_reward": 0.08791719190776348, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 2067.58341217041, | |
| "epoch": 0.4674285714285714, | |
| "grad_norm": 2.919564962387085, | |
| "kl": 1.49102783203125, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.0597, | |
| "reward": 0.07456145505420864, | |
| "reward_std": 0.12785195047035813, | |
| "rewards/cosine_scaled_reward": -0.12900145258754492, | |
| "rewards/format_reward": 0.6875000186264515, | |
| "step": 409 | |
| }, | |
| { | |
| "completion_length": 1623.2500228881836, | |
| "epoch": 0.4685714285714286, | |
| "grad_norm": 2.359740972518921, | |
| "kl": 1.49200439453125, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0597, | |
| "reward": 0.12223792565055192, | |
| "reward_std": 0.13907577726058662, | |
| "rewards/cosine_scaled_reward": -0.026431459933519363, | |
| "rewards/format_reward": 0.7500000167638063, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 1614.6250534057617, | |
| "epoch": 0.4697142857142857, | |
| "grad_norm": 2.7134463787078857, | |
| "kl": 0.764923095703125, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.0306, | |
| "reward": 0.07999165914952755, | |
| "reward_std": 0.12184700695797801, | |
| "rewards/cosine_scaled_reward": -0.16527405753731728, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 411 | |
| }, | |
| { | |
| "completion_length": 1176.7291946411133, | |
| "epoch": 0.47085714285714286, | |
| "grad_norm": 3.365088701248169, | |
| "kl": 0.94677734375, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.0378, | |
| "reward": 0.10302653594408184, | |
| "reward_std": 0.13904161704704165, | |
| "rewards/cosine_scaled_reward": -0.11917255260050297, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 1315.0833854675293, | |
| "epoch": 0.472, | |
| "grad_norm": 3.6166837215423584, | |
| "kl": 0.6557846069335938, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.0263, | |
| "reward": 0.13768333243206143, | |
| "reward_std": 0.12120772805064917, | |
| "rewards/cosine_scaled_reward": -0.05770981824025512, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 413 | |
| }, | |
| { | |
| "completion_length": 2013.416748046875, | |
| "epoch": 0.47314285714285714, | |
| "grad_norm": 2.6909852027893066, | |
| "kl": 1.70458984375, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.0682, | |
| "reward": 0.04394434345886111, | |
| "reward_std": 0.12014323053881526, | |
| "rewards/cosine_scaled_reward": -0.24098782893270254, | |
| "rewards/format_reward": 0.7291666772216558, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 1488.583366394043, | |
| "epoch": 0.4742857142857143, | |
| "grad_norm": 2.07572340965271, | |
| "kl": 0.9158935546875, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0366, | |
| "reward": 0.09156056097708642, | |
| "reward_std": 0.1220615403726697, | |
| "rewards/cosine_scaled_reward": -0.16554791200906038, | |
| "rewards/format_reward": 0.8541666939854622, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 1202.708366394043, | |
| "epoch": 0.4754285714285714, | |
| "grad_norm": 2.5329267978668213, | |
| "kl": 0.7248611450195312, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.029, | |
| "reward": 0.20120010571554303, | |
| "reward_std": 0.16196386714000255, | |
| "rewards/cosine_scaled_reward": 0.1876464392989874, | |
| "rewards/format_reward": 0.7916666772216558, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 1595.958366394043, | |
| "epoch": 0.4765714285714286, | |
| "grad_norm": 2.1365303993225098, | |
| "kl": 0.925079345703125, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.037, | |
| "reward": 0.13611510070040822, | |
| "reward_std": 0.10463207168504596, | |
| "rewards/cosine_scaled_reward": -0.014729505404829979, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 417 | |
| }, | |
| { | |
| "completion_length": 1091.2292137145996, | |
| "epoch": 0.4777142857142857, | |
| "grad_norm": 3.3795764446258545, | |
| "kl": 0.7227020263671875, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.0289, | |
| "reward": 0.21031097415834665, | |
| "reward_std": 0.12943580746650696, | |
| "rewards/cosine_scaled_reward": 0.15948706772178411, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 1551.8750610351562, | |
| "epoch": 0.47885714285714287, | |
| "grad_norm": 2.417349100112915, | |
| "kl": 0.9874191284179688, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.0395, | |
| "reward": 0.1355010142287938, | |
| "reward_std": 0.15785679733380675, | |
| "rewards/cosine_scaled_reward": -0.02094801003113389, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 419 | |
| }, | |
| { | |
| "completion_length": 945.6041870117188, | |
| "epoch": 0.48, | |
| "grad_norm": 1.6603590250015259, | |
| "kl": 0.618865966796875, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.0248, | |
| "reward": 0.10752899164799601, | |
| "reward_std": 0.09324449067935348, | |
| "rewards/cosine_scaled_reward": -0.155439174734056, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 1413.1042251586914, | |
| "epoch": 0.48114285714285715, | |
| "grad_norm": 2.401996374130249, | |
| "kl": 1.11029052734375, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.0444, | |
| "reward": 0.08400759304640815, | |
| "reward_std": 0.13007194455713034, | |
| "rewards/cosine_scaled_reward": -0.18536985479295254, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 421 | |
| }, | |
| { | |
| "completion_length": 1390.4375305175781, | |
| "epoch": 0.48228571428571426, | |
| "grad_norm": 2.419326066970825, | |
| "kl": 1.081390380859375, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.0433, | |
| "reward": 0.13678289717063308, | |
| "reward_std": 0.11409326584544033, | |
| "rewards/cosine_scaled_reward": -0.019857976818457246, | |
| "rewards/format_reward": 0.8333333469927311, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 1838.7917022705078, | |
| "epoch": 0.48342857142857143, | |
| "grad_norm": 1.9876563549041748, | |
| "kl": 1.2999267578125, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.052, | |
| "reward": 0.05111281352583319, | |
| "reward_std": 0.11355760088190436, | |
| "rewards/cosine_scaled_reward": -0.21695283614099026, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 423 | |
| }, | |
| { | |
| "completion_length": 1542.2292251586914, | |
| "epoch": 0.4845714285714286, | |
| "grad_norm": 2.6723501682281494, | |
| "kl": 1.047271728515625, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.0418, | |
| "reward": 0.09121810318902135, | |
| "reward_std": 0.11254043271765113, | |
| "rewards/cosine_scaled_reward": -0.18606012500822544, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 1012.3541717529297, | |
| "epoch": 0.4857142857142857, | |
| "grad_norm": 2.142672061920166, | |
| "kl": 0.32970428466796875, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.0132, | |
| "reward": 0.27949744602665305, | |
| "reward_std": 0.13918299926444888, | |
| "rewards/cosine_scaled_reward": 0.35947928391397, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 1376.7917022705078, | |
| "epoch": 0.4868571428571429, | |
| "grad_norm": 2.2903144359588623, | |
| "kl": 1.4896697998046875, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.0595, | |
| "reward": 0.10234351572580636, | |
| "reward_std": 0.1212971555069089, | |
| "rewards/cosine_scaled_reward": -0.1176013108342886, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 1822.0208892822266, | |
| "epoch": 0.488, | |
| "grad_norm": 3.0833256244659424, | |
| "kl": 0.966033935546875, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.0387, | |
| "reward": 0.1447278270497918, | |
| "reward_std": 0.1653783330693841, | |
| "rewards/cosine_scaled_reward": 0.011570073664188385, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 427 | |
| }, | |
| { | |
| "completion_length": 1313.0625228881836, | |
| "epoch": 0.48914285714285716, | |
| "grad_norm": 1.966538906097412, | |
| "kl": 0.5592117309570312, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.0223, | |
| "reward": 0.12443019635975361, | |
| "reward_std": 0.11524406261742115, | |
| "rewards/cosine_scaled_reward": -0.076734006870538, | |
| "rewards/format_reward": 0.8750000111758709, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 1229.2916870117188, | |
| "epoch": 0.49028571428571427, | |
| "grad_norm": 1.638811707496643, | |
| "kl": 1.2337493896484375, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.0494, | |
| "reward": 0.147400954447221, | |
| "reward_std": 0.1352155078202486, | |
| "rewards/cosine_scaled_reward": -0.012214789167046547, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 429 | |
| }, | |
| { | |
| "completion_length": 979.6458587646484, | |
| "epoch": 0.49142857142857144, | |
| "grad_norm": 1.6275877952575684, | |
| "kl": 0.34814453125, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0139, | |
| "reward": 0.1619871830334887, | |
| "reward_std": 0.11312393890693784, | |
| "rewards/cosine_scaled_reward": 0.02661276888102293, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 1446.2292175292969, | |
| "epoch": 0.49257142857142855, | |
| "grad_norm": 3.2351579666137695, | |
| "kl": 1.28387451171875, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.0514, | |
| "reward": 0.07184328138828278, | |
| "reward_std": 0.0726720115635544, | |
| "rewards/cosine_scaled_reward": -0.19962184969335794, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 431 | |
| }, | |
| { | |
| "completion_length": 1804.979232788086, | |
| "epoch": 0.4937142857142857, | |
| "grad_norm": 3.7058115005493164, | |
| "kl": 1.610595703125, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.0645, | |
| "reward": 0.05518326349556446, | |
| "reward_std": 0.10842809174209833, | |
| "rewards/cosine_scaled_reward": -0.19627886731177568, | |
| "rewards/format_reward": 0.7083333525806665, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 1190.7292022705078, | |
| "epoch": 0.4948571428571429, | |
| "grad_norm": 1.7561228275299072, | |
| "kl": 0.5131759643554688, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.0205, | |
| "reward": 0.1619516264181584, | |
| "reward_std": 0.10530700022354722, | |
| "rewards/cosine_scaled_reward": 0.022388019686331972, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 433 | |
| }, | |
| { | |
| "completion_length": 1296.0417175292969, | |
| "epoch": 0.496, | |
| "grad_norm": 1.8874526023864746, | |
| "kl": 0.7233428955078125, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.0289, | |
| "reward": 0.06844042587908916, | |
| "reward_std": 0.08203518786467612, | |
| "rewards/cosine_scaled_reward": -0.23772956430912018, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 1286.6042213439941, | |
| "epoch": 0.49714285714285716, | |
| "grad_norm": 2.515587568283081, | |
| "kl": 1.2847747802734375, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.0513, | |
| "reward": 0.06948329764418304, | |
| "reward_std": 0.09834526525810361, | |
| "rewards/cosine_scaled_reward": -0.22741128038614988, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 1111.458366394043, | |
| "epoch": 0.4982857142857143, | |
| "grad_norm": 1.83876633644104, | |
| "kl": 0.725067138671875, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.029, | |
| "reward": 0.18536534893792123, | |
| "reward_std": 0.12061038403771818, | |
| "rewards/cosine_scaled_reward": 0.11224743165075779, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 1064.020866394043, | |
| "epoch": 0.49942857142857144, | |
| "grad_norm": 2.1831390857696533, | |
| "kl": 0.4850616455078125, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.0194, | |
| "reward": 0.1590421856380999, | |
| "reward_std": 0.09123086743056774, | |
| "rewards/cosine_scaled_reward": -0.028785159811377525, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 437 | |
| }, | |
| { | |
| "completion_length": 1594.8750610351562, | |
| "epoch": 0.5005714285714286, | |
| "grad_norm": 2.209979295730591, | |
| "kl": 0.8747100830078125, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.035, | |
| "reward": 0.08124704580404796, | |
| "reward_std": 0.10059291700599715, | |
| "rewards/cosine_scaled_reward": -0.1798710956936702, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 1178.1875305175781, | |
| "epoch": 0.5017142857142857, | |
| "grad_norm": 1.6698002815246582, | |
| "kl": 0.3531494140625, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.0141, | |
| "reward": 0.07717898802366108, | |
| "reward_std": 0.06667181965894997, | |
| "rewards/cosine_scaled_reward": -0.24372687563300133, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 439 | |
| }, | |
| { | |
| "completion_length": 1250.7083549499512, | |
| "epoch": 0.5028571428571429, | |
| "grad_norm": 3.8609611988067627, | |
| "kl": 0.908416748046875, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0363, | |
| "reward": 0.08707587665412575, | |
| "reward_std": 0.0901845304761082, | |
| "rewards/cosine_scaled_reward": -0.19678178988397121, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 1613.2292098999023, | |
| "epoch": 0.504, | |
| "grad_norm": 2.8707664012908936, | |
| "kl": 1.0423583984375, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.0417, | |
| "reward": 0.1019732168642804, | |
| "reward_std": 0.11968619748950005, | |
| "rewards/cosine_scaled_reward": -0.07690610364079475, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 441 | |
| }, | |
| { | |
| "completion_length": 1124.3125228881836, | |
| "epoch": 0.5051428571428571, | |
| "grad_norm": 3.18782377243042, | |
| "kl": 0.6033172607421875, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.0241, | |
| "reward": 0.1847122572362423, | |
| "reward_std": 0.11424465058371425, | |
| "rewards/cosine_scaled_reward": 0.06125026382505894, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 1618.208396911621, | |
| "epoch": 0.5062857142857143, | |
| "grad_norm": 3.1647167205810547, | |
| "kl": 0.8673095703125, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.0347, | |
| "reward": 0.1230522379046306, | |
| "reward_std": 0.08929805480875075, | |
| "rewards/cosine_scaled_reward": -0.06928862258791924, | |
| "rewards/format_reward": 0.8541666939854622, | |
| "step": 443 | |
| }, | |
| { | |
| "completion_length": 1356.6458740234375, | |
| "epoch": 0.5074285714285715, | |
| "grad_norm": 2.186877489089966, | |
| "kl": 0.634979248046875, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.0254, | |
| "reward": 0.11778642190620303, | |
| "reward_std": 0.10000652819871902, | |
| "rewards/cosine_scaled_reward": -0.13054069224745035, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 1302.6042022705078, | |
| "epoch": 0.5085714285714286, | |
| "grad_norm": 1.7782143354415894, | |
| "kl": 0.7170562744140625, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.0287, | |
| "reward": 0.09130575158633292, | |
| "reward_std": 0.09221122646704316, | |
| "rewards/cosine_scaled_reward": -0.1979788908502087, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 445 | |
| }, | |
| { | |
| "completion_length": 1449.1667022705078, | |
| "epoch": 0.5097142857142857, | |
| "grad_norm": 2.79809308052063, | |
| "kl": 0.5194091796875, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.0207, | |
| "reward": 0.10592652973718941, | |
| "reward_std": 0.0978604587726295, | |
| "rewards/cosine_scaled_reward": -0.12308421358466148, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 1229.6667137145996, | |
| "epoch": 0.5108571428571429, | |
| "grad_norm": 1.967965006828308, | |
| "kl": 0.792236328125, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.0317, | |
| "reward": 0.12869366817176342, | |
| "reward_std": 0.09020477347075939, | |
| "rewards/cosine_scaled_reward": -0.07901583984494209, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 447 | |
| }, | |
| { | |
| "completion_length": 1233.4167022705078, | |
| "epoch": 0.512, | |
| "grad_norm": 5.0800299644470215, | |
| "kl": 0.8968582153320312, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.0358, | |
| "reward": 0.12176637991797179, | |
| "reward_std": 0.07025636686012149, | |
| "rewards/cosine_scaled_reward": -0.10369249619543552, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 1115.104206085205, | |
| "epoch": 0.5131428571428571, | |
| "grad_norm": 5.752594947814941, | |
| "kl": 0.5258560180664062, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.021, | |
| "reward": 0.09505507163703442, | |
| "reward_std": 0.08659794740378857, | |
| "rewards/cosine_scaled_reward": -0.18167979642748833, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 449 | |
| }, | |
| { | |
| "completion_length": 1027.0416946411133, | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 1.9456526041030884, | |
| "kl": 0.33565521240234375, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0134, | |
| "reward": 0.1392190819606185, | |
| "reward_std": 0.08027261192910373, | |
| "rewards/cosine_scaled_reward": -0.06442757230252028, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 1365.3125495910645, | |
| "epoch": 0.5154285714285715, | |
| "grad_norm": 2.0280954837799072, | |
| "kl": 0.9598846435546875, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.0384, | |
| "reward": 0.14097292395308614, | |
| "reward_std": 0.1067453664727509, | |
| "rewards/cosine_scaled_reward": -0.06067850440740585, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 451 | |
| }, | |
| { | |
| "completion_length": 1629.979248046875, | |
| "epoch": 0.5165714285714286, | |
| "grad_norm": 2.833832025527954, | |
| "kl": 1.052734375, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.0421, | |
| "reward": 0.1490084226243198, | |
| "reward_std": 0.1209279503673315, | |
| "rewards/cosine_scaled_reward": 0.01308462768793106, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 452 | |
| }, | |
| { | |
| "completion_length": 1423.2083702087402, | |
| "epoch": 0.5177142857142857, | |
| "grad_norm": 2.2027335166931152, | |
| "kl": 0.909820556640625, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.0364, | |
| "reward": 0.12369064858648926, | |
| "reward_std": 0.12611138448119164, | |
| "rewards/cosine_scaled_reward": -0.08292348496615887, | |
| "rewards/format_reward": 0.8750000223517418, | |
| "step": 453 | |
| }, | |
| { | |
| "completion_length": 1188.7500381469727, | |
| "epoch": 0.5188571428571429, | |
| "grad_norm": 2.987781524658203, | |
| "kl": 0.386993408203125, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.0155, | |
| "reward": 0.11641452787443995, | |
| "reward_std": 0.0906132124364376, | |
| "rewards/cosine_scaled_reward": -0.12823456013575196, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 454 | |
| }, | |
| { | |
| "completion_length": 1531.6875457763672, | |
| "epoch": 0.52, | |
| "grad_norm": 2.0451388359069824, | |
| "kl": 0.9672393798828125, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.0387, | |
| "reward": 0.08102269377559423, | |
| "reward_std": 0.10998783679679036, | |
| "rewards/cosine_scaled_reward": -0.17945297434926033, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 455 | |
| }, | |
| { | |
| "completion_length": 1520.6458587646484, | |
| "epoch": 0.5211428571428571, | |
| "grad_norm": 1.6886857748031616, | |
| "kl": 0.6373214721679688, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.0255, | |
| "reward": 0.04929352249018848, | |
| "reward_std": 0.10546231491025537, | |
| "rewards/cosine_scaled_reward": -0.23321845568716526, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 456 | |
| }, | |
| { | |
| "completion_length": 1495.3750534057617, | |
| "epoch": 0.5222857142857142, | |
| "grad_norm": 3.0165514945983887, | |
| "kl": 0.941650390625, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.0377, | |
| "reward": 0.09918425139039755, | |
| "reward_std": 0.12838075123727322, | |
| "rewards/cosine_scaled_reward": -0.124738659709692, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 457 | |
| }, | |
| { | |
| "completion_length": 1388.5208740234375, | |
| "epoch": 0.5234285714285715, | |
| "grad_norm": 1.8302446603775024, | |
| "kl": 0.6633834838867188, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.0266, | |
| "reward": 0.07702152655110694, | |
| "reward_std": 0.08541670115664601, | |
| "rewards/cosine_scaled_reward": -0.21506773598957807, | |
| "rewards/format_reward": 0.875, | |
| "step": 458 | |
| }, | |
| { | |
| "completion_length": 1343.5208473205566, | |
| "epoch": 0.5245714285714286, | |
| "grad_norm": 2.486727714538574, | |
| "kl": 0.926055908203125, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.0371, | |
| "reward": 0.1338568499777466, | |
| "reward_std": 0.14767975080758333, | |
| "rewards/cosine_scaled_reward": -0.04322970123030245, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 459 | |
| }, | |
| { | |
| "completion_length": 1559.1250762939453, | |
| "epoch": 0.5257142857142857, | |
| "grad_norm": 1.7466343641281128, | |
| "kl": 0.7754058837890625, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.031, | |
| "reward": 0.11281750063062645, | |
| "reward_std": 0.09168540453538299, | |
| "rewards/cosine_scaled_reward": -0.10402092151343822, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 1441.5417098999023, | |
| "epoch": 0.5268571428571428, | |
| "grad_norm": 2.377612829208374, | |
| "kl": 0.6788558959960938, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.0271, | |
| "reward": 0.12857838673517108, | |
| "reward_std": 0.1296999854966998, | |
| "rewards/cosine_scaled_reward": -0.040342007763683796, | |
| "rewards/format_reward": 0.8333333395421505, | |
| "step": 461 | |
| }, | |
| { | |
| "completion_length": 1260.895866394043, | |
| "epoch": 0.528, | |
| "grad_norm": 5.065842151641846, | |
| "kl": 0.7448501586914062, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.0298, | |
| "reward": 0.08250287733972073, | |
| "reward_std": 0.11116986721754074, | |
| "rewards/cosine_scaled_reward": -0.15972350211814046, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 462 | |
| }, | |
| { | |
| "completion_length": 1197.4166946411133, | |
| "epoch": 0.5291428571428571, | |
| "grad_norm": 2.4315950870513916, | |
| "kl": 0.316741943359375, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.0126, | |
| "reward": 0.17255353461951017, | |
| "reward_std": 0.14624354103580117, | |
| "rewards/cosine_scaled_reward": 0.10790145858481992, | |
| "rewards/format_reward": 0.7916666753590107, | |
| "step": 463 | |
| }, | |
| { | |
| "completion_length": 1026.7292098999023, | |
| "epoch": 0.5302857142857142, | |
| "grad_norm": 1.8613412380218506, | |
| "kl": 0.590972900390625, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.0236, | |
| "reward": 0.17181929713115096, | |
| "reward_std": 0.12013115221634507, | |
| "rewards/cosine_scaled_reward": 0.02979774959385395, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 464 | |
| }, | |
| { | |
| "completion_length": 1346.9375305175781, | |
| "epoch": 0.5314285714285715, | |
| "grad_norm": 1.951102614402771, | |
| "kl": 0.7594528198242188, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0304, | |
| "reward": 0.12367417407222092, | |
| "reward_std": 0.13810825487598777, | |
| "rewards/cosine_scaled_reward": -0.10004201903939247, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 465 | |
| }, | |
| { | |
| "completion_length": 1361.2708702087402, | |
| "epoch": 0.5325714285714286, | |
| "grad_norm": 4.891315937042236, | |
| "kl": 0.6375732421875, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.0255, | |
| "reward": 0.1684982028673403, | |
| "reward_std": 0.13606750033795834, | |
| "rewards/cosine_scaled_reward": 0.051418120972812176, | |
| "rewards/format_reward": 0.875, | |
| "step": 466 | |
| }, | |
| { | |
| "completion_length": 1727.2708740234375, | |
| "epoch": 0.5337142857142857, | |
| "grad_norm": 3.141331434249878, | |
| "kl": 0.8231964111328125, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.0329, | |
| "reward": 0.0738575104624033, | |
| "reward_std": 0.11307883076369762, | |
| "rewards/cosine_scaled_reward": -0.20494843367487192, | |
| "rewards/format_reward": 0.8333333358168602, | |
| "step": 467 | |
| }, | |
| { | |
| "completion_length": 1474.2708930969238, | |
| "epoch": 0.5348571428571428, | |
| "grad_norm": 2.9029293060302734, | |
| "kl": 0.9022445678710938, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.0361, | |
| "reward": 0.11058385335491039, | |
| "reward_std": 0.14076802507042885, | |
| "rewards/cosine_scaled_reward": -0.07730268314480782, | |
| "rewards/format_reward": 0.7916666846722364, | |
| "step": 468 | |
| }, | |
| { | |
| "completion_length": 1365.81254196167, | |
| "epoch": 0.536, | |
| "grad_norm": 2.5009942054748535, | |
| "kl": 0.7577972412109375, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.0303, | |
| "reward": 0.14495484717190266, | |
| "reward_std": 0.13738009426742792, | |
| "rewards/cosine_scaled_reward": 0.003895143046975136, | |
| "rewards/format_reward": 0.8333333395421505, | |
| "step": 469 | |
| }, | |
| { | |
| "completion_length": 1764.3333740234375, | |
| "epoch": 0.5371428571428571, | |
| "grad_norm": 3.4764883518218994, | |
| "kl": 1.04931640625, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.042, | |
| "reward": 0.12439248080772813, | |
| "reward_std": 0.1104172533378005, | |
| "rewards/cosine_scaled_reward": -0.042109834030270576, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 1697.2083587646484, | |
| "epoch": 0.5382857142857143, | |
| "grad_norm": 3.593810558319092, | |
| "kl": 1.015869140625, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.0406, | |
| "reward": 0.16313168196938932, | |
| "reward_std": 0.1711752563714981, | |
| "rewards/cosine_scaled_reward": 0.08633797615766525, | |
| "rewards/format_reward": 0.7708333544433117, | |
| "step": 471 | |
| }, | |
| { | |
| "completion_length": 1437.3125457763672, | |
| "epoch": 0.5394285714285715, | |
| "grad_norm": 2.227132558822632, | |
| "kl": 0.584442138671875, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.0234, | |
| "reward": 0.049232515739277005, | |
| "reward_std": 0.10539399227127433, | |
| "rewards/cosine_scaled_reward": -0.2601514309644699, | |
| "rewards/format_reward": 0.8125000186264515, | |
| "step": 472 | |
| }, | |
| { | |
| "completion_length": 1427.6667175292969, | |
| "epoch": 0.5405714285714286, | |
| "grad_norm": 2.773785352706909, | |
| "kl": 0.7134475708007812, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.0286, | |
| "reward": 0.06174903141800314, | |
| "reward_std": 0.0896931691095233, | |
| "rewards/cosine_scaled_reward": -0.2288785793352872, | |
| "rewards/format_reward": 0.8125000055879354, | |
| "step": 473 | |
| }, | |
| { | |
| "completion_length": 1671.583366394043, | |
| "epoch": 0.5417142857142857, | |
| "grad_norm": 3.813199520111084, | |
| "kl": 1.1416015625, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.0456, | |
| "reward": 0.22307377692777663, | |
| "reward_std": 0.1535223526880145, | |
| "rewards/cosine_scaled_reward": 0.24020265229046345, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 474 | |
| }, | |
| { | |
| "completion_length": 1295.3333587646484, | |
| "epoch": 0.5428571428571428, | |
| "grad_norm": 1.257153868675232, | |
| "kl": 0.5095977783203125, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.0204, | |
| "reward": 0.11992977559566498, | |
| "reward_std": 0.13077597226947546, | |
| "rewards/cosine_scaled_reward": -0.1030871132388711, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 475 | |
| }, | |
| { | |
| "completion_length": 1405.2500457763672, | |
| "epoch": 0.544, | |
| "grad_norm": 1.7341384887695312, | |
| "kl": 0.8976593017578125, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.0359, | |
| "reward": 0.11575727723538876, | |
| "reward_std": 0.13892074767500162, | |
| "rewards/cosine_scaled_reward": -0.09050496038980782, | |
| "rewards/format_reward": 0.8541666828095913, | |
| "step": 476 | |
| }, | |
| { | |
| "completion_length": 1334.520866394043, | |
| "epoch": 0.5451428571428572, | |
| "grad_norm": 2.1544551849365234, | |
| "kl": 0.790008544921875, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.0317, | |
| "reward": 0.1808297468814999, | |
| "reward_std": 0.15548726078122854, | |
| "rewards/cosine_scaled_reward": 0.09577041026204824, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 477 | |
| }, | |
| { | |
| "completion_length": 1640.0417098999023, | |
| "epoch": 0.5462857142857143, | |
| "grad_norm": 2.7642574310302734, | |
| "kl": 1.0536422729492188, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.0421, | |
| "reward": 0.12267802411224693, | |
| "reward_std": 0.11104590399190784, | |
| "rewards/cosine_scaled_reward": -0.03174232318997383, | |
| "rewards/format_reward": 0.7708333469927311, | |
| "step": 478 | |
| }, | |
| { | |
| "completion_length": 1397.3750457763672, | |
| "epoch": 0.5474285714285714, | |
| "grad_norm": 3.118372678756714, | |
| "kl": 0.7571868896484375, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.0302, | |
| "reward": 0.11781717138364911, | |
| "reward_std": 0.11601145751774311, | |
| "rewards/cosine_scaled_reward": -0.06828589458018541, | |
| "rewards/format_reward": 0.8125000186264515, | |
| "step": 479 | |
| }, | |
| { | |
| "completion_length": 1380.4375381469727, | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 2.790198802947998, | |
| "kl": 1.3089599609375, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0523, | |
| "reward": 0.09327662736177444, | |
| "reward_std": 0.12549111153930426, | |
| "rewards/cosine_scaled_reward": -0.14611217193305492, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 1502.9375457763672, | |
| "epoch": 0.5497142857142857, | |
| "grad_norm": 3.935354232788086, | |
| "kl": 0.792388916015625, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.0317, | |
| "reward": 0.06033009593375027, | |
| "reward_std": 0.12011839542537928, | |
| "rewards/cosine_scaled_reward": -0.16942069120705128, | |
| "rewards/format_reward": 0.6875000186264515, | |
| "step": 481 | |
| }, | |
| { | |
| "completion_length": 1320.3541984558105, | |
| "epoch": 0.5508571428571428, | |
| "grad_norm": 3.4273793697357178, | |
| "kl": 0.8202667236328125, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.0328, | |
| "reward": 0.145433189580217, | |
| "reward_std": 0.13374328007921576, | |
| "rewards/cosine_scaled_reward": -0.03734024800360203, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 482 | |
| }, | |
| { | |
| "completion_length": 1709.2500534057617, | |
| "epoch": 0.552, | |
| "grad_norm": 3.1256120204925537, | |
| "kl": 1.09405517578125, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.0438, | |
| "reward": 0.10491634625941515, | |
| "reward_std": 0.13583884108811617, | |
| "rewards/cosine_scaled_reward": -0.11350399069488049, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 483 | |
| }, | |
| { | |
| "completion_length": 1060.3125457763672, | |
| "epoch": 0.5531428571428572, | |
| "grad_norm": 2.7416789531707764, | |
| "kl": 0.593719482421875, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.0237, | |
| "reward": 0.13031714130192995, | |
| "reward_std": 0.12379100965335965, | |
| "rewards/cosine_scaled_reward": -0.09740070005500456, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 484 | |
| }, | |
| { | |
| "completion_length": 1706.8333740234375, | |
| "epoch": 0.5542857142857143, | |
| "grad_norm": 2.7037065029144287, | |
| "kl": 1.599609375, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.064, | |
| "reward": 0.07579714641906321, | |
| "reward_std": 0.1142753018066287, | |
| "rewards/cosine_scaled_reward": -0.1662511508911848, | |
| "rewards/format_reward": 0.7708333469927311, | |
| "step": 485 | |
| }, | |
| { | |
| "completion_length": 909.7500267028809, | |
| "epoch": 0.5554285714285714, | |
| "grad_norm": 2.9626100063323975, | |
| "kl": 0.5513458251953125, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.0221, | |
| "reward": 0.16371853230521083, | |
| "reward_std": 0.08725554682314396, | |
| "rewards/cosine_scaled_reward": -0.0292919734492898, | |
| "rewards/format_reward": 1.0, | |
| "step": 486 | |
| }, | |
| { | |
| "completion_length": 977.083366394043, | |
| "epoch": 0.5565714285714286, | |
| "grad_norm": 1.2269593477249146, | |
| "kl": 0.354217529296875, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.0142, | |
| "reward": 0.18861438240855932, | |
| "reward_std": 0.12223522993735969, | |
| "rewards/cosine_scaled_reward": 0.09728494752198458, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 487 | |
| }, | |
| { | |
| "completion_length": 1098.43754196167, | |
| "epoch": 0.5577142857142857, | |
| "grad_norm": 3.2133853435516357, | |
| "kl": 0.5051727294921875, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.0202, | |
| "reward": 0.13033967884257436, | |
| "reward_std": 0.08723578602075577, | |
| "rewards/cosine_scaled_reward": -0.08787534758448601, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 488 | |
| }, | |
| { | |
| "completion_length": 1465.5000457763672, | |
| "epoch": 0.5588571428571428, | |
| "grad_norm": 4.108081817626953, | |
| "kl": 0.8738555908203125, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.035, | |
| "reward": 0.0360484067350626, | |
| "reward_std": 0.08054300001822412, | |
| "rewards/cosine_scaled_reward": -0.2918727397918701, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 489 | |
| }, | |
| { | |
| "completion_length": 1255.0416984558105, | |
| "epoch": 0.56, | |
| "grad_norm": 1.202156662940979, | |
| "kl": 0.7819290161132812, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.0313, | |
| "reward": 0.15311120147816837, | |
| "reward_std": 0.10377770848572254, | |
| "rewards/cosine_scaled_reward": -0.02998751401901245, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 1578.0625381469727, | |
| "epoch": 0.5611428571428572, | |
| "grad_norm": 1.941928505897522, | |
| "kl": 0.84698486328125, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.0339, | |
| "reward": 0.13159164262469858, | |
| "reward_std": 0.13246915489435196, | |
| "rewards/cosine_scaled_reward": -0.05548815353540704, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 491 | |
| }, | |
| { | |
| "completion_length": 1292.3333587646484, | |
| "epoch": 0.5622857142857143, | |
| "grad_norm": 2.9671390056610107, | |
| "kl": 0.797515869140625, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.0319, | |
| "reward": 0.13928497838787735, | |
| "reward_std": 0.10173067264258862, | |
| "rewards/cosine_scaled_reward": -0.08179534692317247, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 492 | |
| }, | |
| { | |
| "completion_length": 1064.8125228881836, | |
| "epoch": 0.5634285714285714, | |
| "grad_norm": 2.1849365234375, | |
| "kl": 0.4040985107421875, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.0162, | |
| "reward": 0.13772490341216326, | |
| "reward_std": 0.14829062833450735, | |
| "rewards/cosine_scaled_reward": -0.07700726587790996, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 493 | |
| }, | |
| { | |
| "completion_length": 1061.7291870117188, | |
| "epoch": 0.5645714285714286, | |
| "grad_norm": 1.5284849405288696, | |
| "kl": 0.540435791015625, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.0217, | |
| "reward": 0.12483312236145139, | |
| "reward_std": 0.1254338538274169, | |
| "rewards/cosine_scaled_reward": -0.08674540685024112, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 494 | |
| }, | |
| { | |
| "completion_length": 1627.0625305175781, | |
| "epoch": 0.5657142857142857, | |
| "grad_norm": 5.17227029800415, | |
| "kl": 1.3586196899414062, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0544, | |
| "reward": 0.11015307196066715, | |
| "reward_std": 0.13126440905034542, | |
| "rewards/cosine_scaled_reward": -0.08530823234468699, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 495 | |
| }, | |
| { | |
| "completion_length": 1575.4167098999023, | |
| "epoch": 0.5668571428571428, | |
| "grad_norm": 3.1292481422424316, | |
| "kl": 1.2574386596679688, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.0503, | |
| "reward": 0.1306152348406613, | |
| "reward_std": 0.08260433259420097, | |
| "rewards/cosine_scaled_reward": -0.013811783166602254, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 496 | |
| }, | |
| { | |
| "completion_length": 1092.5208587646484, | |
| "epoch": 0.568, | |
| "grad_norm": 3.035909414291382, | |
| "kl": 0.6652145385742188, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.0266, | |
| "reward": 0.17385427234694362, | |
| "reward_std": 0.09661010140553117, | |
| "rewards/cosine_scaled_reward": 0.058778489008545876, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 497 | |
| }, | |
| { | |
| "completion_length": 1320.1250457763672, | |
| "epoch": 0.5691428571428572, | |
| "grad_norm": 1.9357889890670776, | |
| "kl": 0.88330078125, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.0354, | |
| "reward": 0.13382654823362827, | |
| "reward_std": 0.11791369551792741, | |
| "rewards/cosine_scaled_reward": -0.06090674642473459, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 498 | |
| }, | |
| { | |
| "completion_length": 1220.7292022705078, | |
| "epoch": 0.5702857142857143, | |
| "grad_norm": 2.041574239730835, | |
| "kl": 0.31233978271484375, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.0125, | |
| "reward": 0.15456983912736177, | |
| "reward_std": 0.130745030939579, | |
| "rewards/cosine_scaled_reward": -0.008478153496980667, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 499 | |
| }, | |
| { | |
| "completion_length": 1320.8750457763672, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 3.362833023071289, | |
| "kl": 0.72802734375, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0292, | |
| "reward": 0.11918663769029081, | |
| "reward_std": 0.1335056396201253, | |
| "rewards/cosine_scaled_reward": -0.1146287601441145, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.012724974361105416, | |
| "train_runtime": 59499.213, | |
| "train_samples_per_second": 0.403, | |
| "train_steps_per_second": 0.008 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |