Text Generation
Transformers
Safetensors
qwen2
Generated from Trainer
open-r1
trl
grpo
conversational
text-generation-inference
Instructions to use kangdawei/MMR-DR_GRPO with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use kangdawei/MMR-DR_GRPO with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="kangdawei/MMR-DR_GRPO")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("kangdawei/MMR-DR_GRPO")
model = AutoModelForCausalLM.from_pretrained("kangdawei/MMR-DR_GRPO")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use kangdawei/MMR-DR_GRPO with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "kangdawei/MMR-DR_GRPO"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "kangdawei/MMR-DR_GRPO",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/kangdawei/MMR-DR_GRPO
- SGLang
How to use kangdawei/MMR-DR_GRPO with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "kangdawei/MMR-DR_GRPO" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "kangdawei/MMR-DR_GRPO",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "kangdawei/MMR-DR_GRPO" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "kangdawei/MMR-DR_GRPO",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use kangdawei/MMR-DR_GRPO with Docker Model Runner:
docker model run hf.co/kangdawei/MMR-DR_GRPO
| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5714285714285714, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 2571.2083587646484, | |
| "epoch": 0.001142857142857143, | |
| "grad_norm": 0.12744835019111633, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": 0.0681, | |
| "reward": 0.1723687592893839, | |
| "reward_std": 0.7976016625761986, | |
| "rewards/cosine_scaled_reward": -0.015534311532974243, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 2804.395881652832, | |
| "epoch": 0.002285714285714286, | |
| "grad_norm": 0.061661649495363235, | |
| "kl": 0.0, | |
| "learning_rate": 4e-08, | |
| "loss": 0.0245, | |
| "reward": -0.018269629566930234, | |
| "reward_std": 0.44402940198779106, | |
| "rewards/cosine_scaled_reward": -0.04980122856795788, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 3337.375030517578, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.10997606813907623, | |
| "kl": 4.9442052841186523e-05, | |
| "learning_rate": 6e-08, | |
| "loss": -0.0096, | |
| "reward": -0.3936590664088726, | |
| "reward_std": 0.5575782060623169, | |
| "rewards/cosine_scaled_reward": -0.19862568378448486, | |
| "rewards/format_reward": 0.1458333395421505, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 2260.895881652832, | |
| "epoch": 0.004571428571428572, | |
| "grad_norm": 0.15913932025432587, | |
| "kl": 2.993643283843994e-05, | |
| "learning_rate": 8e-08, | |
| "loss": 0.0445, | |
| "reward": 0.13440400827676058, | |
| "reward_std": 0.8942861538380384, | |
| "rewards/cosine_scaled_reward": -0.10464579728432, | |
| "rewards/format_reward": 0.6250000018626451, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 3328.2916870117188, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.12190493941307068, | |
| "kl": 4.523247480392456e-05, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0609, | |
| "reward": -0.27870709635317326, | |
| "reward_std": 0.7129274010658264, | |
| "rewards/cosine_scaled_reward": -0.2122665431816131, | |
| "rewards/format_reward": 0.31250000931322575, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 3129.8333740234375, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.14417913556098938, | |
| "kl": 4.3526291847229004e-05, | |
| "learning_rate": 1.2e-07, | |
| "loss": -0.0215, | |
| "reward": -0.07921706140041351, | |
| "reward_std": 0.6614448297768831, | |
| "rewards/cosine_scaled_reward": -0.06220451416447759, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 3113.729217529297, | |
| "epoch": 0.008, | |
| "grad_norm": 0.1703435778617859, | |
| "kl": 1.8851831555366516e-05, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.0934, | |
| "reward": -0.0697940494865179, | |
| "reward_std": 0.8283112272620201, | |
| "rewards/cosine_scaled_reward": -0.14727119728922844, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 2710.6041870117188, | |
| "epoch": 0.009142857142857144, | |
| "grad_norm": 0.1097821518778801, | |
| "kl": 2.4922192096710205e-05, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.033, | |
| "reward": 0.20415285229682922, | |
| "reward_std": 0.7091612815856934, | |
| "rewards/cosine_scaled_reward": 0.06203982699662447, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 3129.687530517578, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.1483897864818573, | |
| "kl": 4.6446919441223145e-05, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.0837, | |
| "reward": -0.2891757604666054, | |
| "reward_std": 0.7400874830782413, | |
| "rewards/cosine_scaled_reward": -0.21521726623177528, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 2703.3750076293945, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.11335786432027817, | |
| "kl": 3.4183263778686523e-05, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0303, | |
| "reward": -0.11180419684387743, | |
| "reward_std": 0.7680593952536583, | |
| "rewards/cosine_scaled_reward": -0.1549013671465218, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 3326.854217529297, | |
| "epoch": 0.012571428571428572, | |
| "grad_norm": 0.13309723138809204, | |
| "kl": 3.510713577270508e-05, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": 0.0705, | |
| "reward": -0.4452022071927786, | |
| "reward_std": 0.6313146576285362, | |
| "rewards/cosine_scaled_reward": -0.23017787747085094, | |
| "rewards/format_reward": 0.12500000186264515, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 2544.666717529297, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.14041368663311005, | |
| "kl": 3.600120544433594e-05, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.0277, | |
| "reward": 0.006800652190577239, | |
| "reward_std": 0.7336893752217293, | |
| "rewards/cosine_scaled_reward": -0.17795369494706392, | |
| "rewards/format_reward": 0.6250000204890966, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 3018.8333587646484, | |
| "epoch": 0.014857142857142857, | |
| "grad_norm": 0.13021793961524963, | |
| "kl": 3.663450479507446e-05, | |
| "learning_rate": 2.6e-07, | |
| "loss": 0.0269, | |
| "reward": -0.03413328202441335, | |
| "reward_std": 0.6794986762106419, | |
| "rewards/cosine_scaled_reward": -0.06286561000160873, | |
| "rewards/format_reward": 0.3541666828095913, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 2947.5208740234375, | |
| "epoch": 0.016, | |
| "grad_norm": 0.12336233258247375, | |
| "kl": 2.9928982257843018e-05, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0508, | |
| "reward": 0.07709243893623352, | |
| "reward_std": 0.8241597190499306, | |
| "rewards/cosine_scaled_reward": -0.015953163150697947, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 2676.5625228881836, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.047696515917778015, | |
| "kl": 3.113597631454468e-05, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0069, | |
| "reward": 0.036701809614896774, | |
| "reward_std": 0.43677932769060135, | |
| "rewards/cosine_scaled_reward": -0.021881014108657837, | |
| "rewards/format_reward": 0.39583333395421505, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 3442.0000610351562, | |
| "epoch": 0.018285714285714287, | |
| "grad_norm": 0.12319929152727127, | |
| "kl": 3.999471664428711e-05, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0369, | |
| "reward": -0.43698735162615776, | |
| "reward_std": 0.556250561028719, | |
| "rewards/cosine_scaled_reward": -0.20507843233644962, | |
| "rewards/format_reward": 0.1041666679084301, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 2437.750011444092, | |
| "epoch": 0.019428571428571427, | |
| "grad_norm": 0.14334183931350708, | |
| "kl": 3.676116466522217e-05, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": 0.0128, | |
| "reward": 0.32777530141174793, | |
| "reward_std": 0.8251273017376661, | |
| "rewards/cosine_scaled_reward": 0.026506672613322735, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 2874.2292098999023, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.1185201108455658, | |
| "kl": 1.5683472156524658e-05, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0031, | |
| "reward": -0.024667851626873016, | |
| "reward_std": 0.8119602091610432, | |
| "rewards/cosine_scaled_reward": -0.12354222661815584, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 3046.6458587646484, | |
| "epoch": 0.021714285714285714, | |
| "grad_norm": 0.19736716151237488, | |
| "kl": 2.751254942268133e-05, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.0779, | |
| "reward": 0.09869576059281826, | |
| "reward_std": 0.8057069275528193, | |
| "rewards/cosine_scaled_reward": 0.01759765949100256, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 2335.291732788086, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.21245542168617249, | |
| "kl": 1.4175660908222198e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.1185, | |
| "reward": 0.38306641951203346, | |
| "reward_std": 0.8724737018346786, | |
| "rewards/cosine_scaled_reward": 0.04800319205969572, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 2711.3958435058594, | |
| "epoch": 0.024, | |
| "grad_norm": 0.15821056067943573, | |
| "kl": 2.4184584617614746e-05, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.0511, | |
| "reward": 0.1797822918742895, | |
| "reward_std": 0.9325026646256447, | |
| "rewards/cosine_scaled_reward": -0.0036781951785087585, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 1910.2083892822266, | |
| "epoch": 0.025142857142857144, | |
| "grad_norm": 0.14541339874267578, | |
| "kl": 2.391217276453972e-05, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0295, | |
| "reward": 0.347878472879529, | |
| "reward_std": 0.6697478331625462, | |
| "rewards/cosine_scaled_reward": -0.004899490624666214, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 2642.2083740234375, | |
| "epoch": 0.026285714285714287, | |
| "grad_norm": 0.13504153490066528, | |
| "kl": 2.520345151424408e-05, | |
| "learning_rate": 4.6e-07, | |
| "loss": 0.0768, | |
| "reward": 0.019413750036619604, | |
| "reward_std": 0.9948069639503956, | |
| "rewards/cosine_scaled_reward": -0.10891764007828897, | |
| "rewards/format_reward": 0.45833334513008595, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 2697.000015258789, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.13816049695014954, | |
| "kl": 2.109631896018982e-05, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.0697, | |
| "reward": 0.3483977415598929, | |
| "reward_std": 0.8868874367326498, | |
| "rewards/cosine_scaled_reward": 0.06968503817915916, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 2679.3958587646484, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.10494664311408997, | |
| "kl": 3.129430115222931e-05, | |
| "learning_rate": 5e-07, | |
| "loss": 0.001, | |
| "reward": 0.1587733030319214, | |
| "reward_std": 0.7055602557957172, | |
| "rewards/cosine_scaled_reward": 0.02119587583001703, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 3099.979217529297, | |
| "epoch": 0.029714285714285714, | |
| "grad_norm": 0.075159452855587, | |
| "kl": 2.6673078536987305e-05, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.003, | |
| "reward": -0.006685070693492889, | |
| "reward_std": 0.5016829147934914, | |
| "rewards/cosine_scaled_reward": -0.08887793682515621, | |
| "rewards/format_reward": 0.45833333395421505, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 2958.8958740234375, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.13963520526885986, | |
| "kl": 1.3343989849090576e-05, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0628, | |
| "reward": 0.024143089074641466, | |
| "reward_std": 0.7820315174758434, | |
| "rewards/cosine_scaled_reward": -0.0528924111276865, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 2831.958366394043, | |
| "epoch": 0.032, | |
| "grad_norm": 0.0931120440363884, | |
| "kl": 2.434663474559784e-05, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.045, | |
| "reward": -0.018430547177558765, | |
| "reward_std": 0.626394847407937, | |
| "rewards/cosine_scaled_reward": -0.051781938411295414, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 3351.750030517578, | |
| "epoch": 0.03314285714285714, | |
| "grad_norm": 0.10650024563074112, | |
| "kl": 2.886354923248291e-05, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.0438, | |
| "reward": -0.466870941221714, | |
| "reward_std": 0.4314545188099146, | |
| "rewards/cosine_scaled_reward": -0.2414477914571762, | |
| "rewards/format_reward": 0.1458333358168602, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 2859.812545776367, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.18510645627975464, | |
| "kl": 1.806020736694336e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.1004, | |
| "reward": 0.23486553132534027, | |
| "reward_std": 1.0620297119021416, | |
| "rewards/cosine_scaled_reward": 0.04186862939968705, | |
| "rewards/format_reward": 0.4583333469927311, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 3150.250030517578, | |
| "epoch": 0.03542857142857143, | |
| "grad_norm": 0.11023243516683578, | |
| "kl": 1.753866672515869e-05, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.0007, | |
| "reward": -0.2325914899702184, | |
| "reward_std": 0.6160467248409986, | |
| "rewards/cosine_scaled_reward": -0.16382855689153075, | |
| "rewards/format_reward": 0.29166667722165585, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 3241.166717529297, | |
| "epoch": 0.036571428571428574, | |
| "grad_norm": 0.16251012682914734, | |
| "kl": 3.0582770705223083e-05, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.078, | |
| "reward": -0.17190912459045649, | |
| "reward_std": 0.7624128982424736, | |
| "rewards/cosine_scaled_reward": -0.11033781431615353, | |
| "rewards/format_reward": 0.25000000931322575, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 3253.625030517578, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.1084270030260086, | |
| "kl": 4.7340989112854004e-05, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0213, | |
| "reward": -0.2144899107515812, | |
| "reward_std": 0.7134921476244926, | |
| "rewards/cosine_scaled_reward": -0.15857082698494196, | |
| "rewards/format_reward": 0.29166667349636555, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 2398.7708587646484, | |
| "epoch": 0.038857142857142854, | |
| "grad_norm": 0.14240942895412445, | |
| "kl": 6.242096424102783e-05, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0497, | |
| "reward": 0.31558607518672943, | |
| "reward_std": 0.8723539188504219, | |
| "rewards/cosine_scaled_reward": 0.06751577369868755, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 3059.1667098999023, | |
| "epoch": 0.04, | |
| "grad_norm": 0.13201534748077393, | |
| "kl": 4.059448838233948e-05, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0401, | |
| "reward": -0.0117220189422369, | |
| "reward_std": 0.9031526632606983, | |
| "rewards/cosine_scaled_reward": -0.04519825894385576, | |
| "rewards/format_reward": 0.3125000037252903, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 3364.4791870117188, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.09137182682752609, | |
| "kl": 6.282329559326172e-05, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0275, | |
| "reward": -0.37299076607450843, | |
| "reward_std": 0.5668806284666061, | |
| "rewards/cosine_scaled_reward": -0.19870344595983624, | |
| "rewards/format_reward": 0.16666667349636555, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 3180.062530517578, | |
| "epoch": 0.04228571428571429, | |
| "grad_norm": 0.06048983708024025, | |
| "kl": 6.483495235443115e-05, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0165, | |
| "reward": -0.4461427731439471, | |
| "reward_std": 0.4245890639722347, | |
| "rewards/cosine_scaled_reward": -0.26771107502281666, | |
| "rewards/format_reward": 0.22916666977107525, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 3264.937530517578, | |
| "epoch": 0.04342857142857143, | |
| "grad_norm": 0.09965316206216812, | |
| "kl": 6.621703505516052e-05, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0253, | |
| "reward": -0.2411605268716812, | |
| "reward_std": 0.5170722529292107, | |
| "rewards/cosine_scaled_reward": -0.10409475707274396, | |
| "rewards/format_reward": 0.1666666679084301, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 2869.1875534057617, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.05097671225667, | |
| "kl": 7.027853280305862e-05, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.0051, | |
| "reward": 0.1772767100483179, | |
| "reward_std": 0.425695575773716, | |
| "rewards/cosine_scaled_reward": 0.008680417202413082, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 2741.87508392334, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.1285083144903183, | |
| "kl": 0.00026188790798187256, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0503, | |
| "reward": -0.07532945368438959, | |
| "reward_std": 0.5845174305140972, | |
| "rewards/cosine_scaled_reward": -0.10031583718955517, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 3102.6041870117188, | |
| "epoch": 0.046857142857142854, | |
| "grad_norm": 0.11375687271356583, | |
| "kl": 9.709596633911133e-05, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.0088, | |
| "reward": -0.23022597841918468, | |
| "reward_std": 0.6951021775603294, | |
| "rewards/cosine_scaled_reward": -0.16865359526127577, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 2861.2708435058594, | |
| "epoch": 0.048, | |
| "grad_norm": 0.04051314666867256, | |
| "kl": 8.571147918701172e-05, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.0008, | |
| "reward": -0.4261672543361783, | |
| "reward_std": 0.2807927541434765, | |
| "rewards/cosine_scaled_reward": -0.277394006960094, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 3074.937530517578, | |
| "epoch": 0.04914285714285714, | |
| "grad_norm": 0.099971242249012, | |
| "kl": 7.382780313491821e-05, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.0538, | |
| "reward": -0.23065751791000366, | |
| "reward_std": 0.6040490940213203, | |
| "rewards/cosine_scaled_reward": -0.13301679654978216, | |
| "rewards/format_reward": 0.2291666679084301, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 2720.145881652832, | |
| "epoch": 0.05028571428571429, | |
| "grad_norm": 0.10246309638023376, | |
| "kl": 0.0003544166684150696, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": -0.011, | |
| "reward": 0.011595025658607483, | |
| "reward_std": 0.5640863105654716, | |
| "rewards/cosine_scaled_reward": -0.05820102244615555, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 3374.291717529297, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.11695674061775208, | |
| "kl": 0.00015322864055633545, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0382, | |
| "reward": -0.16595029830932617, | |
| "reward_std": 0.7633817754685879, | |
| "rewards/cosine_scaled_reward": -0.08942665439099073, | |
| "rewards/format_reward": 0.20833333767950535, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 3254.958335876465, | |
| "epoch": 0.052571428571428575, | |
| "grad_norm": 0.06262751668691635, | |
| "kl": 0.0002702465280890465, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0021, | |
| "reward": -0.45498750917613506, | |
| "reward_std": 0.3716874625533819, | |
| "rewards/cosine_scaled_reward": -0.24009231384843588, | |
| "rewards/format_reward": 0.1666666679084301, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 2913.1250610351562, | |
| "epoch": 0.053714285714285714, | |
| "grad_norm": 0.13851620256900787, | |
| "kl": 0.0001294887624680996, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.1169, | |
| "reward": 0.13789150305092335, | |
| "reward_std": 0.9271005466580391, | |
| "rewards/cosine_scaled_reward": 0.006347283720970154, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 2858.250045776367, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.12427303940057755, | |
| "kl": 0.0005601570010185242, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0265, | |
| "reward": -0.051424789475277066, | |
| "reward_std": 0.813489394262433, | |
| "rewards/cosine_scaled_reward": -0.07831720494141337, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 2364.562530517578, | |
| "epoch": 0.056, | |
| "grad_norm": 0.09100354462862015, | |
| "kl": 0.0003460892476141453, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0101, | |
| "reward": 0.31277667777612805, | |
| "reward_std": 0.7601135969161987, | |
| "rewards/cosine_scaled_reward": 0.04609484411776066, | |
| "rewards/format_reward": 0.583333333954215, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 2957.937530517578, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.11310350149869919, | |
| "kl": 0.0003506038337945938, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0231, | |
| "reward": 0.07819816470146179, | |
| "reward_std": 0.5129979718476534, | |
| "rewards/cosine_scaled_reward": 0.04247652553021908, | |
| "rewards/format_reward": 0.3125, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 2296.833335876465, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.09167315810918808, | |
| "kl": 0.0011107921600341797, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": 0.0018, | |
| "reward": 0.04252960532903671, | |
| "reward_std": 0.607517946511507, | |
| "rewards/cosine_scaled_reward": -0.12472434528172016, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 2910.0417289733887, | |
| "epoch": 0.05942857142857143, | |
| "grad_norm": 0.1661226898431778, | |
| "kl": 0.0011932700872421265, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0441, | |
| "reward": 0.25355312041938305, | |
| "reward_std": 1.1632588431239128, | |
| "rewards/cosine_scaled_reward": 0.05952752288430929, | |
| "rewards/format_reward": 0.4375000037252903, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 2785.2500610351562, | |
| "epoch": 0.060571428571428575, | |
| "grad_norm": 0.1787281036376953, | |
| "kl": 0.0006063804030418396, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.0319, | |
| "reward": 0.1302148699760437, | |
| "reward_std": 0.8732462916523218, | |
| "rewards/cosine_scaled_reward": -0.028057849034667015, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 2941.104248046875, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.21909195184707642, | |
| "kl": 0.00023331446573138237, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.1682, | |
| "reward": 0.3165966849774122, | |
| "reward_std": 1.0916050113737583, | |
| "rewards/cosine_scaled_reward": 0.09867796488106251, | |
| "rewards/format_reward": 0.45833334885537624, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 3090.6458587646484, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.1303940713405609, | |
| "kl": 0.0007647275924682617, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0367, | |
| "reward": 0.04699781024828553, | |
| "reward_std": 0.8400920890271664, | |
| "rewards/cosine_scaled_reward": -0.015484219416975975, | |
| "rewards/format_reward": 0.33333333767950535, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 2963.4584045410156, | |
| "epoch": 0.064, | |
| "grad_norm": 0.09380260854959488, | |
| "kl": 0.0018551349639892578, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": 0.0177, | |
| "reward": -0.06758141331374645, | |
| "reward_std": 0.6088844947516918, | |
| "rewards/cosine_scaled_reward": -0.0913094412535429, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 3346.7708435058594, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.13119955360889435, | |
| "kl": 0.00034799426794052124, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": 0.0109, | |
| "reward": -0.14502036944031715, | |
| "reward_std": 0.866327153518796, | |
| "rewards/cosine_scaled_reward": -0.12297584302723408, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 2402.6458740234375, | |
| "epoch": 0.06628571428571428, | |
| "grad_norm": 0.11010728776454926, | |
| "kl": 0.00420612096786499, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0515, | |
| "reward": 0.10594136267900467, | |
| "reward_std": 0.6864449828863144, | |
| "rewards/cosine_scaled_reward": -0.09731632098555565, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 3156.6458740234375, | |
| "epoch": 0.06742857142857143, | |
| "grad_norm": 0.14981059730052948, | |
| "kl": 0.0015277080237865448, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": 0.0349, | |
| "reward": -0.15045391581952572, | |
| "reward_std": 0.6176320239901543, | |
| "rewards/cosine_scaled_reward": -0.09630896709859371, | |
| "rewards/format_reward": 0.2708333432674408, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 3069.6041870117188, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.08052746206521988, | |
| "kl": 0.0005385726690292358, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0124, | |
| "reward": -0.26248200982809067, | |
| "reward_std": 0.5521546974778175, | |
| "rewards/cosine_scaled_reward": -0.19996057264506817, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 3061.250030517578, | |
| "epoch": 0.06971428571428571, | |
| "grad_norm": 0.1755966693162918, | |
| "kl": 0.0010443329811096191, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.106, | |
| "reward": 0.027474643662571907, | |
| "reward_std": 0.7631269134581089, | |
| "rewards/cosine_scaled_reward": -0.0676457080990076, | |
| "rewards/format_reward": 0.43750000931322575, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 2801.791778564453, | |
| "epoch": 0.07085714285714285, | |
| "grad_norm": 0.5502915382385254, | |
| "kl": 0.06185805797576904, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.0777, | |
| "reward": 0.16424999572336674, | |
| "reward_std": 1.018867939710617, | |
| "rewards/cosine_scaled_reward": -0.02840923797339201, | |
| "rewards/format_reward": 0.5000000186264515, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 2495.2708740234375, | |
| "epoch": 0.072, | |
| "grad_norm": 0.19492964446544647, | |
| "kl": 0.0023099184036254883, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": 0.1135, | |
| "reward": 0.40039824694395065, | |
| "reward_std": 1.032276712357998, | |
| "rewards/cosine_scaled_reward": 0.06732268328778446, | |
| "rewards/format_reward": 0.6458333507180214, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 3155.354217529297, | |
| "epoch": 0.07314285714285715, | |
| "grad_norm": 0.13117599487304688, | |
| "kl": 0.0020775794982910156, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": 0.0771, | |
| "reward": -0.09214674681425095, | |
| "reward_std": 0.7898851484060287, | |
| "rewards/cosine_scaled_reward": -0.06703834654763341, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 2716.3542251586914, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.11173497885465622, | |
| "kl": 0.00382232666015625, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0204, | |
| "reward": -0.11022741347551346, | |
| "reward_std": 0.672077115625143, | |
| "rewards/cosine_scaled_reward": -0.14180615730583668, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 2310.1041831970215, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.06708691269159317, | |
| "kl": 0.0025424957275390625, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.0339, | |
| "reward": 0.20656662248075008, | |
| "reward_std": 0.589762119576335, | |
| "rewards/cosine_scaled_reward": 0.03981146775186062, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 3528.8541870117188, | |
| "epoch": 0.07657142857142857, | |
| "grad_norm": 0.07865083962678909, | |
| "kl": 0.0022742748260498047, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.0142, | |
| "reward": -0.46846646815538406, | |
| "reward_std": 0.5137450993061066, | |
| "rewards/cosine_scaled_reward": -0.22528725676238537, | |
| "rewards/format_reward": 0.1041666679084301, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 2388.875030517578, | |
| "epoch": 0.07771428571428571, | |
| "grad_norm": 0.1428786665201187, | |
| "kl": 0.009924888610839844, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": 0.0722, | |
| "reward": 0.0902576670050621, | |
| "reward_std": 0.7360437363386154, | |
| "rewards/cosine_scaled_reward": -0.06687035039067268, | |
| "rewards/format_reward": 0.5208333414047956, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 2723.7708740234375, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.08696511387825012, | |
| "kl": 0.004919290542602539, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": 0.0544, | |
| "reward": -0.24375398270785809, | |
| "reward_std": 0.5107810720801353, | |
| "rewards/cosine_scaled_reward": -0.21045764535665512, | |
| "rewards/format_reward": 0.37500000186264515, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 3078.8125762939453, | |
| "epoch": 0.08, | |
| "grad_norm": 0.09773669391870499, | |
| "kl": 0.0020369887351989746, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0139, | |
| "reward": -0.0818065321072936, | |
| "reward_std": 0.6509491205215454, | |
| "rewards/cosine_scaled_reward": -0.11709374003112316, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 2727.979202270508, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.07420142740011215, | |
| "kl": 0.004831179976463318, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.002, | |
| "reward": -0.03378719836473465, | |
| "reward_std": 0.6056636273860931, | |
| "rewards/cosine_scaled_reward": -0.07006076944526285, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 3157.750030517578, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.12406568974256516, | |
| "kl": 0.006371498107910156, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.0799, | |
| "reward": -0.2358899898827076, | |
| "reward_std": 0.626086350530386, | |
| "rewards/cosine_scaled_reward": -0.14525874331593513, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 3562.5625, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.07233923673629761, | |
| "kl": 0.0012898445129394531, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": 0.0053, | |
| "reward": -0.47873567789793015, | |
| "reward_std": 0.4625825770199299, | |
| "rewards/cosine_scaled_reward": -0.23070307821035385, | |
| "rewards/format_reward": 0.1041666679084301, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 3244.3334045410156, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.1488964557647705, | |
| "kl": 0.003963470458984375, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.0581, | |
| "reward": -0.0020135529339313507, | |
| "reward_std": 0.7993322685360909, | |
| "rewards/cosine_scaled_reward": -0.022909073159098625, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 3028.687545776367, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.08981792628765106, | |
| "kl": 0.005274057388305664, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0046, | |
| "reward": 0.03718305751681328, | |
| "reward_std": 0.6846093349158764, | |
| "rewards/cosine_scaled_reward": -0.010177075862884521, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 2771.2083740234375, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.10527168214321136, | |
| "kl": 0.0019083023071289062, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.0698, | |
| "reward": -0.13475606916472316, | |
| "reward_std": 0.6331770308315754, | |
| "rewards/cosine_scaled_reward": -0.19242265075445175, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 3160.062545776367, | |
| "epoch": 0.088, | |
| "grad_norm": 0.053626783192157745, | |
| "kl": 0.0024300217628479004, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": 0.0089, | |
| "reward": -0.07912399154156446, | |
| "reward_std": 0.4063580594956875, | |
| "rewards/cosine_scaled_reward": -0.07036877004429698, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 3157.354217529297, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.12693333625793457, | |
| "kl": 0.0019735097885131836, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.043, | |
| "reward": 0.10285145416855812, | |
| "reward_std": 0.8885178752243519, | |
| "rewards/cosine_scaled_reward": 0.01143704541027546, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 2627.145866394043, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.0993918776512146, | |
| "kl": 0.005728721618652344, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.0208, | |
| "reward": 0.005513674899702892, | |
| "reward_std": 0.696781549602747, | |
| "rewards/cosine_scaled_reward": -0.11277296394109726, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 3378.6666870117188, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.11993207037448883, | |
| "kl": 0.0034465789794921875, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0148, | |
| "reward": -0.25228459760546684, | |
| "reward_std": 0.6576736122369766, | |
| "rewards/cosine_scaled_reward": -0.15029108710587025, | |
| "rewards/format_reward": 0.2291666753590107, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 3124.9792098999023, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.07844506949186325, | |
| "kl": 0.0075626373291015625, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": 0.013, | |
| "reward": -0.3477923655882478, | |
| "reward_std": 0.4749002642929554, | |
| "rewards/cosine_scaled_reward": -0.2027296293526888, | |
| "rewards/format_reward": 0.22916666977107525, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 2722.375045776367, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.13039354979991913, | |
| "kl": 0.006313920021057129, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": -0.0104, | |
| "reward": 0.08498383313417435, | |
| "reward_std": 0.7605466395616531, | |
| "rewards/cosine_scaled_reward": -0.03437880612909794, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 2761.1250228881836, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.11148078739643097, | |
| "kl": 0.006354331970214844, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": 0.0171, | |
| "reward": -0.1586984060704708, | |
| "reward_std": 0.6908692009747028, | |
| "rewards/cosine_scaled_reward": -0.14450077898800373, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 3147.3333587646484, | |
| "epoch": 0.096, | |
| "grad_norm": 0.15051236748695374, | |
| "kl": 0.0017566680908203125, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0437, | |
| "reward": 0.20425852667540312, | |
| "reward_std": 0.8553840257227421, | |
| "rewards/cosine_scaled_reward": 0.06748985398371588, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 3178.7500762939453, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.12316413968801498, | |
| "kl": 0.002705097198486328, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0373, | |
| "reward": -0.051826220005750656, | |
| "reward_std": 0.8340367153286934, | |
| "rewards/cosine_scaled_reward": -0.0864723757840693, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 3106.2083740234375, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.16358214616775513, | |
| "kl": 0.005809783935546875, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0502, | |
| "reward": -0.21086052944883704, | |
| "reward_std": 0.6448673270642757, | |
| "rewards/cosine_scaled_reward": -0.1603346224874258, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 2988.3334045410156, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.18081220984458923, | |
| "kl": 0.006814241409301758, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0524, | |
| "reward": 0.13324597105383873, | |
| "reward_std": 0.8470934070646763, | |
| "rewards/cosine_scaled_reward": -0.020617252215743065, | |
| "rewards/format_reward": 0.47916668094694614, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 2812.625030517578, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.1538206785917282, | |
| "kl": 0.012554168701171875, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.0806, | |
| "reward": 0.1440363209694624, | |
| "reward_std": 0.7827430870383978, | |
| "rewards/cosine_scaled_reward": -0.009355325251817703, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 3243.562515258789, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.10299122333526611, | |
| "kl": 0.004688262939453125, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.0151, | |
| "reward": -0.2134521808475256, | |
| "reward_std": 0.6248220186680555, | |
| "rewards/cosine_scaled_reward": -0.12200713902711868, | |
| "rewards/format_reward": 0.22916667349636555, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 2706.8958587646484, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.08340943604707718, | |
| "kl": 0.009927749633789062, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0087, | |
| "reward": -0.25001570768654346, | |
| "reward_std": 0.5442444123327732, | |
| "rewards/cosine_scaled_reward": -0.24867095332592726, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 2932.2500762939453, | |
| "epoch": 0.104, | |
| "grad_norm": 0.19951650500297546, | |
| "kl": 0.004284858703613281, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.0723, | |
| "reward": 0.07665663212537766, | |
| "reward_std": 0.8412054367363453, | |
| "rewards/cosine_scaled_reward": -0.04515018220990896, | |
| "rewards/format_reward": 0.4375000037252903, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 2869.812545776367, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.12959204614162445, | |
| "kl": 0.010074138641357422, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.0231, | |
| "reward": 0.038971804082393646, | |
| "reward_std": 0.8026900477707386, | |
| "rewards/cosine_scaled_reward": -0.07580470014363527, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 3530.5625, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.09351453185081482, | |
| "kl": 0.005786895751953125, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.0137, | |
| "reward": -0.5460330247879028, | |
| "reward_std": 0.4978806171566248, | |
| "rewards/cosine_scaled_reward": -0.25744735077023506, | |
| "rewards/format_reward": 0.06250000186264515, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 3159.645835876465, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.09856709837913513, | |
| "kl": 0.009381294250488281, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.0331, | |
| "reward": -0.2638606168329716, | |
| "reward_std": 0.6384533829987049, | |
| "rewards/cosine_scaled_reward": -0.1464436650276184, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 3420.812530517578, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.0972990095615387, | |
| "kl": 0.0027017593383789062, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.0175, | |
| "reward": -0.29529019072651863, | |
| "reward_std": 0.6119959745556116, | |
| "rewards/cosine_scaled_reward": -0.18471521139144897, | |
| "rewards/format_reward": 0.2500000074505806, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 3003.437515258789, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.10754227638244629, | |
| "kl": 0.008491039276123047, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0294, | |
| "reward": -0.0011827312409877777, | |
| "reward_std": 0.7199659757316113, | |
| "rewards/cosine_scaled_reward": -0.056293437257409096, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 3227.7708740234375, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.055975738912820816, | |
| "kl": 0.005445957183837891, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.0114, | |
| "reward": -0.04315417259931564, | |
| "reward_std": 0.39706853218376637, | |
| "rewards/cosine_scaled_reward": -0.03440806642174721, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 3059.354217529297, | |
| "epoch": 0.112, | |
| "grad_norm": 0.10543321073055267, | |
| "kl": 0.005114555358886719, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.0567, | |
| "reward": -0.14790542237460613, | |
| "reward_std": 0.6289958246052265, | |
| "rewards/cosine_scaled_reward": -0.12040873523801565, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 2944.104217529297, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.14774614572525024, | |
| "kl": 0.007845878601074219, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0208, | |
| "reward": 0.024691712111234665, | |
| "reward_std": 0.8161002658307552, | |
| "rewards/cosine_scaled_reward": -0.026264889165759087, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 3003.562515258789, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.14759190380573273, | |
| "kl": 0.00844573974609375, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0498, | |
| "reward": -0.03464020788669586, | |
| "reward_std": 0.7968008350580931, | |
| "rewards/cosine_scaled_reward": -0.07347729802131653, | |
| "rewards/format_reward": 0.35416667349636555, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 2660.7500228881836, | |
| "epoch": 0.11542857142857142, | |
| "grad_norm": 0.05885668471455574, | |
| "kl": 0.004193544387817383, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.0099, | |
| "reward": -0.024280589073896408, | |
| "reward_std": 0.478687334805727, | |
| "rewards/cosine_scaled_reward": -0.06905538472346961, | |
| "rewards/format_reward": 0.39583333395421505, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 3017.6459045410156, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.19236327707767487, | |
| "kl": 0.01230621337890625, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.1031, | |
| "reward": 0.0067907206248492, | |
| "reward_std": 0.7886966746300459, | |
| "rewards/cosine_scaled_reward": -0.07641204819083214, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 3141.437545776367, | |
| "epoch": 0.11771428571428572, | |
| "grad_norm": 0.12005823105573654, | |
| "kl": 0.010993003845214844, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0418, | |
| "reward": 0.11525936797261238, | |
| "reward_std": 0.7872378453612328, | |
| "rewards/cosine_scaled_reward": -0.020046040415763855, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 2757.520851135254, | |
| "epoch": 0.11885714285714286, | |
| "grad_norm": 0.16546285152435303, | |
| "kl": 0.007067680358886719, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.029, | |
| "reward": -0.012922056019306183, | |
| "reward_std": 0.7183502614498138, | |
| "rewards/cosine_scaled_reward": -0.08642422500997782, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 2874.68758392334, | |
| "epoch": 0.12, | |
| "grad_norm": 0.15343305468559265, | |
| "kl": 0.0062236785888671875, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": -0.0011, | |
| "reward": 0.3746343031525612, | |
| "reward_std": 0.9715399444103241, | |
| "rewards/cosine_scaled_reward": 0.13150912104174495, | |
| "rewards/format_reward": 0.47916668094694614, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 2406.2292404174805, | |
| "epoch": 0.12114285714285715, | |
| "grad_norm": 0.11538145691156387, | |
| "kl": 0.014267921447753906, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0512, | |
| "reward": 0.7112023187801242, | |
| "reward_std": 0.7698421813547611, | |
| "rewards/cosine_scaled_reward": 0.2695285137742758, | |
| "rewards/format_reward": 0.6875000111758709, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 2906.083366394043, | |
| "epoch": 0.12228571428571429, | |
| "grad_norm": 0.1356947422027588, | |
| "kl": 0.0071544647216796875, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.0559, | |
| "reward": 0.04975247010588646, | |
| "reward_std": 0.6387073248624802, | |
| "rewards/cosine_scaled_reward": -0.04459162801504135, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 2924.479202270508, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 1.227287769317627, | |
| "kl": 0.09261322021484375, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.0411, | |
| "reward": -0.06936776265501976, | |
| "reward_std": 0.6040569245815277, | |
| "rewards/cosine_scaled_reward": -0.09396206960082054, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 3159.4166870117188, | |
| "epoch": 0.12457142857142857, | |
| "grad_norm": 0.06464666873216629, | |
| "kl": 0.0052165985107421875, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": -0.0001, | |
| "reward": -0.19677976984530687, | |
| "reward_std": 0.3907594494521618, | |
| "rewards/cosine_scaled_reward": -0.13640070147812366, | |
| "rewards/format_reward": 0.3125, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 2941.979217529297, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.11352120339870453, | |
| "kl": 0.0075016021728515625, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0559, | |
| "reward": -0.02889834251254797, | |
| "reward_std": 0.6912853047251701, | |
| "rewards/cosine_scaled_reward": -0.09470331901684403, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 3445.0625610351562, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.1718732863664627, | |
| "kl": 0.0095977783203125, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.0525, | |
| "reward": -0.1273586554452777, | |
| "reward_std": 0.7887241318821907, | |
| "rewards/cosine_scaled_reward": -0.042100945487618446, | |
| "rewards/format_reward": 0.1666666716337204, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 3372.2083740234375, | |
| "epoch": 0.128, | |
| "grad_norm": 0.13645856082439423, | |
| "kl": 0.00563812255859375, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0184, | |
| "reward": -0.056365881115198135, | |
| "reward_std": 0.7170692086219788, | |
| "rewards/cosine_scaled_reward": -0.048214955255389214, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 2884.791717529297, | |
| "epoch": 0.12914285714285714, | |
| "grad_norm": 0.20523111522197723, | |
| "kl": 0.010105133056640625, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": -0.0033, | |
| "reward": 0.13258023280650377, | |
| "reward_std": 0.8612403385341167, | |
| "rewards/cosine_scaled_reward": -0.058843023143708706, | |
| "rewards/format_reward": 0.5416666809469461, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 2625.458396911621, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.08083964139223099, | |
| "kl": 0.0072994232177734375, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0276, | |
| "reward": -0.11776229925453663, | |
| "reward_std": 0.5373759977519512, | |
| "rewards/cosine_scaled_reward": -0.2312552430666983, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 3025.1041717529297, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.06019025295972824, | |
| "kl": 0.0069713592529296875, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0315, | |
| "reward": -0.2688702419400215, | |
| "reward_std": 0.44118795450776815, | |
| "rewards/cosine_scaled_reward": -0.14728060085326433, | |
| "rewards/format_reward": 0.2291666716337204, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 3401.4166870117188, | |
| "epoch": 0.13257142857142856, | |
| "grad_norm": 0.1261359304189682, | |
| "kl": 0.0067615509033203125, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.0071, | |
| "reward": -0.318508867174387, | |
| "reward_std": 0.602899955585599, | |
| "rewards/cosine_scaled_reward": -0.16228950582444668, | |
| "rewards/format_reward": 0.1666666716337204, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 3060.7916717529297, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.0866546705365181, | |
| "kl": 0.008612632751464844, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.0005, | |
| "reward": -0.3146707344567403, | |
| "reward_std": 0.5091048590838909, | |
| "rewards/cosine_scaled_reward": -0.2149065202102065, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 3120.0833435058594, | |
| "epoch": 0.13485714285714287, | |
| "grad_norm": 0.18219080567359924, | |
| "kl": 0.00536346435546875, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0522, | |
| "reward": 0.4105420224368572, | |
| "reward_std": 1.0126653835177422, | |
| "rewards/cosine_scaled_reward": 0.17556388210505247, | |
| "rewards/format_reward": 0.43750000931322575, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 2533.9167251586914, | |
| "epoch": 0.136, | |
| "grad_norm": 0.1974111646413803, | |
| "kl": 0.054787635803222656, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.0693, | |
| "reward": 0.0521804504096508, | |
| "reward_std": 0.7259144820272923, | |
| "rewards/cosine_scaled_reward": -0.07517153583467007, | |
| "rewards/format_reward": 0.4791666679084301, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 2534.4167137145996, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.11178319901227951, | |
| "kl": 0.009916305541992188, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0475, | |
| "reward": 0.0007900348864495754, | |
| "reward_std": 0.6627710647881031, | |
| "rewards/cosine_scaled_reward": -0.09777611820027232, | |
| "rewards/format_reward": 0.45833333767950535, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 2313.229248046875, | |
| "epoch": 0.1382857142857143, | |
| "grad_norm": 0.16405229270458221, | |
| "kl": 0.013139724731445312, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.0402, | |
| "reward": 0.3239856115542352, | |
| "reward_std": 0.7424066159874201, | |
| "rewards/cosine_scaled_reward": -0.005178395658731461, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 2761.8958587646484, | |
| "epoch": 0.13942857142857143, | |
| "grad_norm": 0.12051185965538025, | |
| "kl": 0.0068817138671875, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.026, | |
| "reward": 0.3165215402841568, | |
| "reward_std": 0.7121441401541233, | |
| "rewards/cosine_scaled_reward": 0.07533489167690277, | |
| "rewards/format_reward": 0.5416666697710752, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 3158.104217529297, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.1131911501288414, | |
| "kl": 0.007569313049316406, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.0551, | |
| "reward": -0.16298508271574974, | |
| "reward_std": 0.7209546975791454, | |
| "rewards/cosine_scaled_reward": -0.13522059097886086, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 2517.312530517578, | |
| "epoch": 0.1417142857142857, | |
| "grad_norm": 0.12443465739488602, | |
| "kl": 0.009019851684570312, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": -0.0164, | |
| "reward": 0.19143125228583813, | |
| "reward_std": 0.8426807429641485, | |
| "rewards/cosine_scaled_reward": -0.0331303218845278, | |
| "rewards/format_reward": 0.5625000055879354, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 2815.5833740234375, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.06811921298503876, | |
| "kl": 0.0053272247314453125, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.0266, | |
| "reward": -0.04552587552461773, | |
| "reward_std": 0.594413885846734, | |
| "rewards/cosine_scaled_reward": -0.050011674873530865, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 2926.750045776367, | |
| "epoch": 0.144, | |
| "grad_norm": 0.11453459411859512, | |
| "kl": 0.005611419677734375, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0231, | |
| "reward": -0.027173910290002823, | |
| "reward_std": 0.6515960693359375, | |
| "rewards/cosine_scaled_reward": -0.11056716740131378, | |
| "rewards/format_reward": 0.45833333395421505, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 3180.0416870117188, | |
| "epoch": 0.14514285714285713, | |
| "grad_norm": 0.13321727514266968, | |
| "kl": 0.008800506591796875, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.0247, | |
| "reward": -0.2067241296172142, | |
| "reward_std": 0.7376469634473324, | |
| "rewards/cosine_scaled_reward": -0.1875669350847602, | |
| "rewards/format_reward": 0.35416667722165585, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 2854.145866394043, | |
| "epoch": 0.1462857142857143, | |
| "grad_norm": 0.13030071556568146, | |
| "kl": 0.008055686950683594, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.0297, | |
| "reward": 0.28113045543432236, | |
| "reward_std": 0.7527024820446968, | |
| "rewards/cosine_scaled_reward": 0.08081426518037915, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 3468.4583435058594, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.09160865843296051, | |
| "kl": 0.009267807006835938, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.0182, | |
| "reward": -0.3858581744134426, | |
| "reward_std": 0.4741293340921402, | |
| "rewards/cosine_scaled_reward": -0.17545855604112148, | |
| "rewards/format_reward": 0.12500000186264515, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 3145.2083435058594, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.11552103608846664, | |
| "kl": 0.008371353149414062, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0555, | |
| "reward": -0.1405576877295971, | |
| "reward_std": 0.6180392988026142, | |
| "rewards/cosine_scaled_reward": -0.08423710052738898, | |
| "rewards/format_reward": 0.2500000074505806, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 2990.750011444092, | |
| "epoch": 0.14971428571428572, | |
| "grad_norm": 0.1343425065279007, | |
| "kl": 0.010997772216796875, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.0573, | |
| "reward": -0.0012444127351045609, | |
| "reward_std": 0.7613321915268898, | |
| "rewards/cosine_scaled_reward": -0.0183742493391037, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 2849.229217529297, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.1754404902458191, | |
| "kl": 0.007671356201171875, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.0039, | |
| "reward": 0.15461664367467165, | |
| "reward_std": 1.0978601425886154, | |
| "rewards/cosine_scaled_reward": -0.016693929443135858, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 3204.5208740234375, | |
| "epoch": 0.152, | |
| "grad_norm": 0.09437424689531326, | |
| "kl": 0.010120391845703125, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.0371, | |
| "reward": -0.3896838743239641, | |
| "reward_std": 0.46856095641851425, | |
| "rewards/cosine_scaled_reward": -0.23207763396203518, | |
| "rewards/format_reward": 0.22916666977107525, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 2440.8125534057617, | |
| "epoch": 0.15314285714285714, | |
| "grad_norm": 0.11227884888648987, | |
| "kl": 0.010713577270507812, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.0059, | |
| "reward": 0.03150389529764652, | |
| "reward_std": 0.6699380800127983, | |
| "rewards/cosine_scaled_reward": -0.10194769129157066, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 2048.7083740234375, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.11956455558538437, | |
| "kl": 0.009142875671386719, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.0417, | |
| "reward": 0.611090637743473, | |
| "reward_std": 0.8544140718877316, | |
| "rewards/cosine_scaled_reward": 0.19521427806466818, | |
| "rewards/format_reward": 0.6875, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 3031.7708587646484, | |
| "epoch": 0.15542857142857142, | |
| "grad_norm": 0.16711600124835968, | |
| "kl": 0.012142181396484375, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.0697, | |
| "reward": 0.3044010065495968, | |
| "reward_std": 0.8896326720714569, | |
| "rewards/cosine_scaled_reward": 0.08019791916012764, | |
| "rewards/format_reward": 0.5000000093132257, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 3144.291717529297, | |
| "epoch": 0.15657142857142858, | |
| "grad_norm": 0.1869216412305832, | |
| "kl": 0.01092529296875, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.0486, | |
| "reward": -0.18480181868653744, | |
| "reward_std": 0.7662193942815065, | |
| "rewards/cosine_scaled_reward": -0.14503308382700197, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 2823.6667251586914, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.1232113316655159, | |
| "kl": 0.010227203369140625, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.0399, | |
| "reward": -0.14219614176545292, | |
| "reward_std": 0.6205537635833025, | |
| "rewards/cosine_scaled_reward": -0.1589539386332035, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 3272.187530517578, | |
| "epoch": 0.15885714285714286, | |
| "grad_norm": 0.15304112434387207, | |
| "kl": 0.0121917724609375, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.0143, | |
| "reward": -0.028373660519719124, | |
| "reward_std": 0.7605781219899654, | |
| "rewards/cosine_scaled_reward": -0.07536444254219532, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 3336.541717529297, | |
| "epoch": 0.16, | |
| "grad_norm": 0.1725020408630371, | |
| "kl": 0.018383026123046875, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0185, | |
| "reward": -0.09492362570017576, | |
| "reward_std": 0.739283999428153, | |
| "rewards/cosine_scaled_reward": -0.052022709511220455, | |
| "rewards/format_reward": 0.2291666753590107, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 2818.7500915527344, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.1428345888853073, | |
| "kl": 0.014141082763671875, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.0428, | |
| "reward": 0.17010034061968327, | |
| "reward_std": 0.8265232257544994, | |
| "rewards/cosine_scaled_reward": -0.03258664230816066, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 2865.041717529297, | |
| "epoch": 0.16228571428571428, | |
| "grad_norm": 0.132287859916687, | |
| "kl": 0.012800216674804688, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": -0.0266, | |
| "reward": 0.14131132513284683, | |
| "reward_std": 0.6835053861141205, | |
| "rewards/cosine_scaled_reward": -0.061369335278868675, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 2789.979217529297, | |
| "epoch": 0.16342857142857142, | |
| "grad_norm": 0.2371223270893097, | |
| "kl": 0.01451873779296875, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.0726, | |
| "reward": -0.21053062099963427, | |
| "reward_std": 0.6688540205359459, | |
| "rewards/cosine_scaled_reward": -0.22842608066275716, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 3133.833366394043, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.14418281614780426, | |
| "kl": 0.0147552490234375, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.0211, | |
| "reward": -0.19076000433415174, | |
| "reward_std": 0.8512778505682945, | |
| "rewards/cosine_scaled_reward": -0.1536552112083882, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 2111.166679382324, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.08913204073905945, | |
| "kl": 0.011333465576171875, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.0067, | |
| "reward": 0.37262871488928795, | |
| "reward_std": 0.6370398961007595, | |
| "rewards/cosine_scaled_reward": 0.05427891947329044, | |
| "rewards/format_reward": 0.6666666679084301, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 2267.708351135254, | |
| "epoch": 0.16685714285714287, | |
| "grad_norm": 0.09961648285388947, | |
| "kl": 0.008184432983398438, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.0193, | |
| "reward": 0.021780904848128557, | |
| "reward_std": 0.629846852272749, | |
| "rewards/cosine_scaled_reward": -0.18135410267859697, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 2814.354202270508, | |
| "epoch": 0.168, | |
| "grad_norm": 0.1152271032333374, | |
| "kl": 0.015193939208984375, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": -0.0113, | |
| "reward": 0.11335810273885727, | |
| "reward_std": 0.75190694257617, | |
| "rewards/cosine_scaled_reward": -0.01303301053121686, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 2681.6458892822266, | |
| "epoch": 0.16914285714285715, | |
| "grad_norm": 0.10002895444631577, | |
| "kl": 0.0156402587890625, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0397, | |
| "reward": 0.015451362356543541, | |
| "reward_std": 0.6257824674248695, | |
| "rewards/cosine_scaled_reward": -0.08100172178819776, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 2944.9375610351562, | |
| "epoch": 0.1702857142857143, | |
| "grad_norm": 0.11802194267511368, | |
| "kl": 0.012241363525390625, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.0548, | |
| "reward": -0.029212953057140112, | |
| "reward_std": 0.5978215262293816, | |
| "rewards/cosine_scaled_reward": -0.11697033792734146, | |
| "rewards/format_reward": 0.47916668094694614, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 3025.500045776367, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.158779576420784, | |
| "kl": 0.016357421875, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.075, | |
| "reward": 0.17186034470796585, | |
| "reward_std": 0.9030736871063709, | |
| "rewards/cosine_scaled_reward": 0.03854179289191961, | |
| "rewards/format_reward": 0.39583333395421505, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 2807.2709045410156, | |
| "epoch": 0.17257142857142857, | |
| "grad_norm": 0.2797929644584656, | |
| "kl": 0.016078948974609375, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.1148, | |
| "reward": 0.35514865489676595, | |
| "reward_std": 1.013380728662014, | |
| "rewards/cosine_scaled_reward": 0.09170010522939265, | |
| "rewards/format_reward": 0.520833345130086, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 2813.312511444092, | |
| "epoch": 0.1737142857142857, | |
| "grad_norm": 0.15241533517837524, | |
| "kl": 0.016437530517578125, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.0504, | |
| "reward": -0.21058875182643533, | |
| "reward_std": 0.6285759322345257, | |
| "rewards/cosine_scaled_reward": -0.17488694563508034, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 2995.937515258789, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.09293405711650848, | |
| "kl": 0.022808074951171875, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": 0.0405, | |
| "reward": -0.2887796126306057, | |
| "reward_std": 0.4931412860751152, | |
| "rewards/cosine_scaled_reward": -0.21798867918550968, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 3390.1458435058594, | |
| "epoch": 0.176, | |
| "grad_norm": 0.17047946155071259, | |
| "kl": 0.01389312744140625, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.0042, | |
| "reward": 0.22278353199362755, | |
| "reward_std": 0.9290903359651566, | |
| "rewards/cosine_scaled_reward": 0.07489011948928237, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 2628.625030517578, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.16334204375743866, | |
| "kl": 0.01788330078125, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.0719, | |
| "reward": 0.07151136547327042, | |
| "reward_std": 0.850743044167757, | |
| "rewards/cosine_scaled_reward": -0.05793104809708893, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 3115.0208740234375, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.1153322234749794, | |
| "kl": 0.015087127685546875, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.0422, | |
| "reward": -0.11803282611072063, | |
| "reward_std": 0.6224103905260563, | |
| "rewards/cosine_scaled_reward": -0.07339339889585972, | |
| "rewards/format_reward": 0.2708333432674408, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 3112.3333740234375, | |
| "epoch": 0.17942857142857144, | |
| "grad_norm": 0.11238392442464828, | |
| "kl": 0.018207550048828125, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.0267, | |
| "reward": -0.12678863108158112, | |
| "reward_std": 0.6108014769852161, | |
| "rewards/cosine_scaled_reward": -0.13557185977697372, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 3134.9375610351562, | |
| "epoch": 0.18057142857142858, | |
| "grad_norm": 0.14315029978752136, | |
| "kl": 0.016357421875, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.0291, | |
| "reward": 0.31565938144922256, | |
| "reward_std": 0.7341890074312687, | |
| "rewards/cosine_scaled_reward": 0.12421234138309956, | |
| "rewards/format_reward": 0.43750000931322575, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 3197.2916870117188, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.08521895110607147, | |
| "kl": 0.02240753173828125, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": -0.0065, | |
| "reward": -0.2289612852036953, | |
| "reward_std": 0.45951012521982193, | |
| "rewards/cosine_scaled_reward": -0.15268473327159882, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 2835.1250076293945, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.1391443908214569, | |
| "kl": 0.0212249755859375, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0389, | |
| "reward": -0.09338995814323425, | |
| "reward_std": 0.6676197461783886, | |
| "rewards/cosine_scaled_reward": -0.08398129511624575, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 2894.2500762939453, | |
| "epoch": 0.184, | |
| "grad_norm": 0.15109732747077942, | |
| "kl": 0.022735595703125, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.0618, | |
| "reward": 0.18475250899791718, | |
| "reward_std": 0.7132500857114792, | |
| "rewards/cosine_scaled_reward": 0.038985077291727066, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 3240.104217529297, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.25355350971221924, | |
| "kl": 0.02893829345703125, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.03, | |
| "reward": -0.08004653453826904, | |
| "reward_std": 0.7777004204690456, | |
| "rewards/cosine_scaled_reward": -0.050195490941405296, | |
| "rewards/format_reward": 0.2500000074505806, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 2491.4166870117188, | |
| "epoch": 0.18628571428571428, | |
| "grad_norm": 0.10477182269096375, | |
| "kl": 0.019779205322265625, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.0184, | |
| "reward": 0.30021179956384003, | |
| "reward_std": 0.5858964845538139, | |
| "rewards/cosine_scaled_reward": 0.05349706672132015, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 2634.3334045410156, | |
| "epoch": 0.18742857142857142, | |
| "grad_norm": 0.10012540221214294, | |
| "kl": 0.02051544189453125, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.0372, | |
| "reward": 0.18599897995591164, | |
| "reward_std": 0.5746248178184032, | |
| "rewards/cosine_scaled_reward": 0.004582956433296204, | |
| "rewards/format_reward": 0.520833333954215, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 3389.854217529297, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.17076270282268524, | |
| "kl": 0.0308380126953125, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0336, | |
| "reward": -0.19457256980240345, | |
| "reward_std": 0.8124973215162754, | |
| "rewards/cosine_scaled_reward": -0.14226967841386795, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 3060.5416870117188, | |
| "epoch": 0.18971428571428572, | |
| "grad_norm": 0.1075100377202034, | |
| "kl": 0.01813507080078125, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.0452, | |
| "reward": -0.002260749228298664, | |
| "reward_std": 0.5852988548576832, | |
| "rewards/cosine_scaled_reward": -0.04963597096502781, | |
| "rewards/format_reward": 0.37500000931322575, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 2668.5625610351562, | |
| "epoch": 0.19085714285714286, | |
| "grad_norm": 0.17578865587711334, | |
| "kl": 0.0214080810546875, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.0434, | |
| "reward": -0.03433691617101431, | |
| "reward_std": 0.7491964735090733, | |
| "rewards/cosine_scaled_reward": -0.18043148797005415, | |
| "rewards/format_reward": 0.5625000055879354, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 3086.2083892822266, | |
| "epoch": 0.192, | |
| "grad_norm": 0.1612425446510315, | |
| "kl": 0.02228546142578125, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.0382, | |
| "reward": 0.3381266240030527, | |
| "reward_std": 0.7707905732095242, | |
| "rewards/cosine_scaled_reward": 0.10119387321174145, | |
| "rewards/format_reward": 0.5208333469927311, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 2517.1041946411133, | |
| "epoch": 0.19314285714285714, | |
| "grad_norm": 0.16143831610679626, | |
| "kl": 0.025909423828125, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.0434, | |
| "reward": 0.7395636513829231, | |
| "reward_std": 0.8513101674616337, | |
| "rewards/cosine_scaled_reward": 0.3049415610730648, | |
| "rewards/format_reward": 0.6458333358168602, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 2514.916702270508, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.10102265328168869, | |
| "kl": 0.0198974609375, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0574, | |
| "reward": 0.13654466345906258, | |
| "reward_std": 0.6086900364607573, | |
| "rewards/cosine_scaled_reward": 0.003594242036342621, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 2985.0208740234375, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.13056401908397675, | |
| "kl": 0.0213165283203125, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.0198, | |
| "reward": 0.062067726626992226, | |
| "reward_std": 0.7003943808376789, | |
| "rewards/cosine_scaled_reward": -0.022239719983190298, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 3023.25, | |
| "epoch": 0.19657142857142856, | |
| "grad_norm": 0.13006411492824554, | |
| "kl": 0.027801513671875, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.0166, | |
| "reward": -0.10689089074730873, | |
| "reward_std": 0.7168557345867157, | |
| "rewards/cosine_scaled_reward": -0.0742192417383194, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 2487.312572479248, | |
| "epoch": 0.1977142857142857, | |
| "grad_norm": 0.15723471343517303, | |
| "kl": 0.029430389404296875, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.0449, | |
| "reward": 0.07556262612342834, | |
| "reward_std": 0.7289031557738781, | |
| "rewards/cosine_scaled_reward": -0.11148698627948761, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 2791.6459045410156, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.12100586295127869, | |
| "kl": 0.03168487548828125, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.0243, | |
| "reward": 0.15438630525022745, | |
| "reward_std": 0.6935141123831272, | |
| "rewards/cosine_scaled_reward": -0.043977076187729836, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 2961.2083892822266, | |
| "epoch": 0.2, | |
| "grad_norm": 0.10910208523273468, | |
| "kl": 0.027740478515625, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.0207, | |
| "reward": 0.0747014251537621, | |
| "reward_std": 0.6700296718627214, | |
| "rewards/cosine_scaled_reward": -0.031223440542817116, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 2736.104217529297, | |
| "epoch": 0.20114285714285715, | |
| "grad_norm": 0.22644685208797455, | |
| "kl": 0.02967071533203125, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.0671, | |
| "reward": 0.0621003326959908, | |
| "reward_std": 0.8631531074643135, | |
| "rewards/cosine_scaled_reward": -0.06792039083666168, | |
| "rewards/format_reward": 0.45833333767950535, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 3160.0000610351562, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.23566588759422302, | |
| "kl": 0.03285980224609375, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.0637, | |
| "reward": 0.023158524534665048, | |
| "reward_std": 0.7583435364067554, | |
| "rewards/cosine_scaled_reward": -0.10049578547477722, | |
| "rewards/format_reward": 0.5000000186264515, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 2873.854232788086, | |
| "epoch": 0.20342857142857143, | |
| "grad_norm": 0.19544903934001923, | |
| "kl": 0.0418548583984375, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.0085, | |
| "reward": 0.14865556359291077, | |
| "reward_std": 0.8756407424807549, | |
| "rewards/cosine_scaled_reward": 0.016698965802788734, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 3137.1666870117188, | |
| "epoch": 0.20457142857142857, | |
| "grad_norm": 0.07909320294857025, | |
| "kl": 0.03249359130859375, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.0049, | |
| "reward": -0.36861060559749603, | |
| "reward_std": 0.3870035018771887, | |
| "rewards/cosine_scaled_reward": -0.24457938224077225, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 2372.270866394043, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.14237825572490692, | |
| "kl": 0.03276824951171875, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0399, | |
| "reward": 0.31979808397591114, | |
| "reward_std": 0.7863202355802059, | |
| "rewards/cosine_scaled_reward": 0.06971612432971597, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 3232.3958892822266, | |
| "epoch": 0.20685714285714285, | |
| "grad_norm": 0.11890272796154022, | |
| "kl": 0.0372467041015625, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.0285, | |
| "reward": -0.08456094935536385, | |
| "reward_std": 0.6224741712212563, | |
| "rewards/cosine_scaled_reward": -0.07428957521915436, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 2675.750045776367, | |
| "epoch": 0.208, | |
| "grad_norm": 0.1544029265642166, | |
| "kl": 0.026336669921875, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.0012, | |
| "reward": 0.28332219598814845, | |
| "reward_std": 0.8586058430373669, | |
| "rewards/cosine_scaled_reward": 0.04892569035291672, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 2722.3334045410156, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.2383163869380951, | |
| "kl": 0.0434417724609375, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.0422, | |
| "reward": 0.37364979088306427, | |
| "reward_std": 0.7791223339736462, | |
| "rewards/cosine_scaled_reward": 0.08836854621767998, | |
| "rewards/format_reward": 0.583333345130086, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 2838.3125534057617, | |
| "epoch": 0.2102857142857143, | |
| "grad_norm": 0.08796233683824539, | |
| "kl": 0.037139892578125, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.0153, | |
| "reward": -0.2598969964310527, | |
| "reward_std": 0.48821557871997356, | |
| "rewards/cosine_scaled_reward": -0.2099758218973875, | |
| "rewards/format_reward": 0.3541666679084301, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 2718.395851135254, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.14351139962673187, | |
| "kl": 0.0391998291015625, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.0168, | |
| "reward": -0.022745168476831168, | |
| "reward_std": 0.6752897650003433, | |
| "rewards/cosine_scaled_reward": -0.14890163764357567, | |
| "rewards/format_reward": 0.5416666828095913, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 3067.541717529297, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.10349977016448975, | |
| "kl": 0.038116455078125, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.0083, | |
| "reward": 0.06632867828011513, | |
| "reward_std": 0.6240403186529875, | |
| "rewards/cosine_scaled_reward": 0.0068851374089717865, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 2420.291702270508, | |
| "epoch": 0.21371428571428572, | |
| "grad_norm": 0.15342004597187042, | |
| "kl": 0.046356201171875, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.0298, | |
| "reward": 0.08685800805687904, | |
| "reward_std": 0.6128952912986279, | |
| "rewards/cosine_scaled_reward": -0.11734730005264282, | |
| "rewards/format_reward": 0.6250000111758709, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 3567.3958435058594, | |
| "epoch": 0.21485714285714286, | |
| "grad_norm": 0.15315963327884674, | |
| "kl": 0.05072021484375, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.0074, | |
| "reward": -0.398057883605361, | |
| "reward_std": 0.6201205141842365, | |
| "rewards/cosine_scaled_reward": -0.18498908844776452, | |
| "rewards/format_reward": 0.10416666977107525, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 3156.937545776367, | |
| "epoch": 0.216, | |
| "grad_norm": 0.22657519578933716, | |
| "kl": 0.05267333984375, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.0596, | |
| "reward": 0.08714086917461827, | |
| "reward_std": 0.7904996033757925, | |
| "rewards/cosine_scaled_reward": -0.008101928047835827, | |
| "rewards/format_reward": 0.3958333469927311, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 2789.041702270508, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.13739848136901855, | |
| "kl": 0.051605224609375, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0128, | |
| "reward": 0.15377038344740868, | |
| "reward_std": 0.7078918963670731, | |
| "rewards/cosine_scaled_reward": -0.0018926504999399185, | |
| "rewards/format_reward": 0.47916668094694614, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 2624.083366394043, | |
| "epoch": 0.21828571428571428, | |
| "grad_norm": 0.17836546897888184, | |
| "kl": 0.05645751953125, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.0403, | |
| "reward": 0.17610792024061084, | |
| "reward_std": 0.7685041464865208, | |
| "rewards/cosine_scaled_reward": 0.013859146274626255, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 3288.916717529297, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.20157144963741302, | |
| "kl": 0.053802490234375, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.0343, | |
| "reward": -0.18958128988742828, | |
| "reward_std": 0.6780943684279919, | |
| "rewards/cosine_scaled_reward": -0.14980729576200247, | |
| "rewards/format_reward": 0.31250000931322575, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 3114.1458740234375, | |
| "epoch": 0.22057142857142858, | |
| "grad_norm": 0.18799538910388947, | |
| "kl": 0.0482025146484375, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.0323, | |
| "reward": -0.24548013135790825, | |
| "reward_std": 0.6712213661521673, | |
| "rewards/cosine_scaled_reward": -0.1577296955510974, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 3140.9375610351562, | |
| "epoch": 0.22171428571428572, | |
| "grad_norm": 0.27719372510910034, | |
| "kl": 0.0517578125, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.0627, | |
| "reward": 0.3136282116174698, | |
| "reward_std": 1.0058045387268066, | |
| "rewards/cosine_scaled_reward": 0.1217548530548811, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 3052.187545776367, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.2891777753829956, | |
| "kl": 0.058441162109375, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0488, | |
| "reward": -0.09272150322794914, | |
| "reward_std": 0.6414213795214891, | |
| "rewards/cosine_scaled_reward": -0.07150299660861492, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 3178.437530517578, | |
| "epoch": 0.224, | |
| "grad_norm": 0.1800169199705124, | |
| "kl": 0.06561279296875, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0051, | |
| "reward": -0.12664931640028954, | |
| "reward_std": 0.584855318069458, | |
| "rewards/cosine_scaled_reward": -0.08012674562633038, | |
| "rewards/format_reward": 0.27083333395421505, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 2523.6667404174805, | |
| "epoch": 0.22514285714285714, | |
| "grad_norm": 0.29900649189949036, | |
| "kl": 0.08282470703125, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.0404, | |
| "reward": 0.32191772386431694, | |
| "reward_std": 1.1363152042031288, | |
| "rewards/cosine_scaled_reward": 0.052287210419308394, | |
| "rewards/format_reward": 0.5416666809469461, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 2786.625045776367, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 0.34645184874534607, | |
| "kl": 0.10394287109375, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.0284, | |
| "reward": -0.09496624395251274, | |
| "reward_std": 0.6138490363955498, | |
| "rewards/cosine_scaled_reward": -0.11393817700445652, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 2917.729217529297, | |
| "epoch": 0.22742857142857142, | |
| "grad_norm": 0.27866116166114807, | |
| "kl": 0.08343505859375, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.0601, | |
| "reward": -0.2887334353290498, | |
| "reward_std": 0.6878443285822868, | |
| "rewards/cosine_scaled_reward": -0.18037102394737303, | |
| "rewards/format_reward": 0.22916667349636555, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 2416.0833587646484, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.29420676827430725, | |
| "kl": 0.07171630859375, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0608, | |
| "reward": 0.38031474966555834, | |
| "reward_std": 0.8896522857248783, | |
| "rewards/cosine_scaled_reward": 0.08112852554768324, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 2426.1667098999023, | |
| "epoch": 0.2297142857142857, | |
| "grad_norm": 0.3698631525039673, | |
| "kl": 0.069580078125, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.0396, | |
| "reward": 0.5210681445896626, | |
| "reward_std": 0.8631033673882484, | |
| "rewards/cosine_scaled_reward": 0.18454162776470184, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 2384.5416946411133, | |
| "epoch": 0.23085714285714284, | |
| "grad_norm": 0.21861740946769714, | |
| "kl": 0.09124755859375, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.0316, | |
| "reward": 0.4668930694460869, | |
| "reward_std": 0.7846425659954548, | |
| "rewards/cosine_scaled_reward": 0.17546686669811606, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 3118.041717529297, | |
| "epoch": 0.232, | |
| "grad_norm": 0.32266831398010254, | |
| "kl": 0.115875244140625, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.058, | |
| "reward": 0.050868467427790165, | |
| "reward_std": 0.8579942099750042, | |
| "rewards/cosine_scaled_reward": 0.037848809035494924, | |
| "rewards/format_reward": 0.22916666977107525, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 2304.9583587646484, | |
| "epoch": 0.23314285714285715, | |
| "grad_norm": 0.18206676840782166, | |
| "kl": 0.109619140625, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.0048, | |
| "reward": 0.30574227310717106, | |
| "reward_std": 0.6170836389064789, | |
| "rewards/cosine_scaled_reward": -0.008607452735304832, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 2729.062545776367, | |
| "epoch": 0.2342857142857143, | |
| "grad_norm": 0.3828980624675751, | |
| "kl": 0.11627197265625, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.0304, | |
| "reward": 0.5650580562651157, | |
| "reward_std": 1.018170591443777, | |
| "rewards/cosine_scaled_reward": 0.22851963574066758, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 3118.2500534057617, | |
| "epoch": 0.23542857142857143, | |
| "grad_norm": 0.38659653067588806, | |
| "kl": 0.12823486328125, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.0409, | |
| "reward": -0.19641774892807007, | |
| "reward_std": 0.7758117392659187, | |
| "rewards/cosine_scaled_reward": -0.16738836327567697, | |
| "rewards/format_reward": 0.3125000037252903, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 3061.0000610351562, | |
| "epoch": 0.23657142857142857, | |
| "grad_norm": 0.5246978998184204, | |
| "kl": 0.1700439453125, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.0753, | |
| "reward": -0.06954369135200977, | |
| "reward_std": 0.7209791392087936, | |
| "rewards/cosine_scaled_reward": -0.15650581319641788, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 2615.5833587646484, | |
| "epoch": 0.2377142857142857, | |
| "grad_norm": 0.590552568435669, | |
| "kl": 0.137298583984375, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.0517, | |
| "reward": 0.10035054851323366, | |
| "reward_std": 0.6904583033174276, | |
| "rewards/cosine_scaled_reward": -0.06608795002102852, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 2907.0625610351562, | |
| "epoch": 0.23885714285714285, | |
| "grad_norm": 0.3516550660133362, | |
| "kl": 0.1842041015625, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.0368, | |
| "reward": 0.19722715765237808, | |
| "reward_std": 0.7542031668126583, | |
| "rewards/cosine_scaled_reward": 0.045978354290127754, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 2872.8333740234375, | |
| "epoch": 0.24, | |
| "grad_norm": 0.31704995036125183, | |
| "kl": 0.17333984375, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0472, | |
| "reward": 0.005523813422769308, | |
| "reward_std": 0.6337867602705956, | |
| "rewards/cosine_scaled_reward": -0.05793424462899566, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 2413.104202270508, | |
| "epoch": 0.24114285714285713, | |
| "grad_norm": 0.30833762884140015, | |
| "kl": 0.18377685546875, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.0023, | |
| "reward": 0.17130140773952007, | |
| "reward_std": 0.6732404362410307, | |
| "rewards/cosine_scaled_reward": -0.004216981120407581, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 2611.0000762939453, | |
| "epoch": 0.2422857142857143, | |
| "grad_norm": 0.36040380597114563, | |
| "kl": 0.195404052734375, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.0443, | |
| "reward": 0.207231349311769, | |
| "reward_std": 0.8025440499186516, | |
| "rewards/cosine_scaled_reward": 0.004049690440297127, | |
| "rewards/format_reward": 0.520833345130086, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 2825.479217529297, | |
| "epoch": 0.24342857142857144, | |
| "grad_norm": 0.4717042148113251, | |
| "kl": 0.2646484375, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.0311, | |
| "reward": 0.24717780202627182, | |
| "reward_std": 0.9448489658534527, | |
| "rewards/cosine_scaled_reward": 0.05484255403280258, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 2770.4375762939453, | |
| "epoch": 0.24457142857142858, | |
| "grad_norm": 0.5870217084884644, | |
| "kl": 0.2110595703125, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.0885, | |
| "reward": 0.2060818038880825, | |
| "reward_std": 0.8542167469859123, | |
| "rewards/cosine_scaled_reward": 0.04777835123240948, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 2372.354217529297, | |
| "epoch": 0.24571428571428572, | |
| "grad_norm": 0.2540161609649658, | |
| "kl": 0.204833984375, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0192, | |
| "reward": -0.08897280413657427, | |
| "reward_std": 0.5330733954906464, | |
| "rewards/cosine_scaled_reward": -0.18874458596110344, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 2626.2083892822266, | |
| "epoch": 0.24685714285714286, | |
| "grad_norm": 0.38485583662986755, | |
| "kl": 0.2821044921875, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.0632, | |
| "reward": 0.3148540537804365, | |
| "reward_std": 0.8362242169678211, | |
| "rewards/cosine_scaled_reward": 0.04583857045508921, | |
| "rewards/format_reward": 0.5833333469927311, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 3083.104217529297, | |
| "epoch": 0.248, | |
| "grad_norm": 0.35301655530929565, | |
| "kl": 0.3206787109375, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": 0.0247, | |
| "reward": 0.1059049442410469, | |
| "reward_std": 0.8536710906773806, | |
| "rewards/cosine_scaled_reward": -0.003996940446086228, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 2729.8125610351562, | |
| "epoch": 0.24914285714285714, | |
| "grad_norm": 0.3946671783924103, | |
| "kl": 0.2547607421875, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.0153, | |
| "reward": -0.2474758685566485, | |
| "reward_std": 0.6462593208998442, | |
| "rewards/cosine_scaled_reward": -0.22353118157479912, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 2773.6250610351562, | |
| "epoch": 0.2502857142857143, | |
| "grad_norm": 0.5205709338188171, | |
| "kl": 0.28680419921875, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.0678, | |
| "reward": 0.17035892861895263, | |
| "reward_std": 0.8409441113471985, | |
| "rewards/cosine_scaled_reward": -0.03153566690161824, | |
| "rewards/format_reward": 0.5416666772216558, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 3064.9792404174805, | |
| "epoch": 0.25142857142857145, | |
| "grad_norm": 0.3793766498565674, | |
| "kl": 0.3310546875, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0536, | |
| "reward": -0.40579412039369345, | |
| "reward_std": 0.4997274577617645, | |
| "rewards/cosine_scaled_reward": -0.2578846197575331, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 2607.375045776367, | |
| "epoch": 0.25257142857142856, | |
| "grad_norm": 0.4927677810192108, | |
| "kl": 0.2911376953125, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.0538, | |
| "reward": 0.29882943257689476, | |
| "reward_std": 0.8470067903399467, | |
| "rewards/cosine_scaled_reward": 0.015899650752544403, | |
| "rewards/format_reward": 0.625000013038516, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 2744.041748046875, | |
| "epoch": 0.2537142857142857, | |
| "grad_norm": 0.3646686375141144, | |
| "kl": 0.331298828125, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.0247, | |
| "reward": 0.22724982630461454, | |
| "reward_std": 0.6808402426540852, | |
| "rewards/cosine_scaled_reward": -0.022449948824942112, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 2798.2083740234375, | |
| "epoch": 0.25485714285714284, | |
| "grad_norm": 0.397216260433197, | |
| "kl": 0.3436279296875, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.0215, | |
| "reward": 0.16939541138708591, | |
| "reward_std": 0.5345852337777615, | |
| "rewards/cosine_scaled_reward": 0.007962611503899097, | |
| "rewards/format_reward": 0.5, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 3249.1666870117188, | |
| "epoch": 0.256, | |
| "grad_norm": 0.7151598930358887, | |
| "kl": 0.38037109375, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.0902, | |
| "reward": -0.05321236699819565, | |
| "reward_std": 0.8325384929776192, | |
| "rewards/cosine_scaled_reward": -0.046919144690036774, | |
| "rewards/format_reward": 0.2708333432674408, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 2881.2709350585938, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.412383109331131, | |
| "kl": 0.35223388671875, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0437, | |
| "reward": 0.012765285558998585, | |
| "reward_std": 0.6845123060047626, | |
| "rewards/cosine_scaled_reward": -0.08529938757419586, | |
| "rewards/format_reward": 0.45833333395421505, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 2802.041717529297, | |
| "epoch": 0.2582857142857143, | |
| "grad_norm": 0.4051804840564728, | |
| "kl": 0.37164306640625, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.0577, | |
| "reward": 0.17664484679698944, | |
| "reward_std": 0.7071616277098656, | |
| "rewards/cosine_scaled_reward": 0.03299903869628906, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 2718.291732788086, | |
| "epoch": 0.25942857142857145, | |
| "grad_norm": 0.4118206799030304, | |
| "kl": 0.400390625, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.0558, | |
| "reward": 0.059418462216854095, | |
| "reward_std": 0.6883127726614475, | |
| "rewards/cosine_scaled_reward": -0.16530904080718756, | |
| "rewards/format_reward": 0.6666666734963655, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 2551.270866394043, | |
| "epoch": 0.26057142857142856, | |
| "grad_norm": 0.5437552332878113, | |
| "kl": 0.3446044921875, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.0212, | |
| "reward": 0.16873644525185227, | |
| "reward_std": 0.6643664948642254, | |
| "rewards/cosine_scaled_reward": -0.00022228434681892395, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 3142.5208740234375, | |
| "epoch": 0.26171428571428573, | |
| "grad_norm": 0.824421226978302, | |
| "kl": 0.490966796875, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.022, | |
| "reward": -0.009648986160755157, | |
| "reward_std": 0.523324097506702, | |
| "rewards/cosine_scaled_reward": -0.06068984791636467, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 3071.4583740234375, | |
| "epoch": 0.26285714285714284, | |
| "grad_norm": 0.5922899842262268, | |
| "kl": 0.43035888671875, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0518, | |
| "reward": -0.29550562985241413, | |
| "reward_std": 0.6693998202681541, | |
| "rewards/cosine_scaled_reward": -0.22350211441516876, | |
| "rewards/format_reward": 0.31250000931322575, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 2912.7083740234375, | |
| "epoch": 0.264, | |
| "grad_norm": 0.5167642831802368, | |
| "kl": 0.3560791015625, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.0283, | |
| "reward": 0.1596711277961731, | |
| "reward_std": 0.7965347096323967, | |
| "rewards/cosine_scaled_reward": -0.01580745540559292, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 3158.1459045410156, | |
| "epoch": 0.2651428571428571, | |
| "grad_norm": 0.6749157905578613, | |
| "kl": 0.4017333984375, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.0837, | |
| "reward": -0.297568422742188, | |
| "reward_std": 0.6748274452984333, | |
| "rewards/cosine_scaled_reward": -0.18697709869593382, | |
| "rewards/format_reward": 0.22916666977107525, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 2762.1666870117188, | |
| "epoch": 0.2662857142857143, | |
| "grad_norm": 0.3311655819416046, | |
| "kl": 0.317626953125, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.0416, | |
| "reward": -0.08082350715994835, | |
| "reward_std": 0.6287317145615816, | |
| "rewards/cosine_scaled_reward": -0.16008687764406204, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 2734.8750648498535, | |
| "epoch": 0.2674285714285714, | |
| "grad_norm": 0.2763339579105377, | |
| "kl": 0.291839599609375, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.0202, | |
| "reward": -0.14287907630205154, | |
| "reward_std": 0.5912873484194279, | |
| "rewards/cosine_scaled_reward": -0.13689745217561722, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 2442.187515258789, | |
| "epoch": 0.26857142857142857, | |
| "grad_norm": 0.324736088514328, | |
| "kl": 0.25311279296875, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.0321, | |
| "reward": -0.05069837532937527, | |
| "reward_std": 0.6721667312085629, | |
| "rewards/cosine_scaled_reward": -0.173017387278378, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 2971.2083740234375, | |
| "epoch": 0.26971428571428574, | |
| "grad_norm": 0.6796517968177795, | |
| "kl": 0.234130859375, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.0103, | |
| "reward": 0.2688827021047473, | |
| "reward_std": 0.9795111119747162, | |
| "rewards/cosine_scaled_reward": -0.004769146267790347, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 2812.8750610351562, | |
| "epoch": 0.27085714285714285, | |
| "grad_norm": 0.9011460542678833, | |
| "kl": 0.2601318359375, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.0936, | |
| "reward": 0.06308133527636528, | |
| "reward_std": 0.7535826116800308, | |
| "rewards/cosine_scaled_reward": -0.068068141117692, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 2691.7084045410156, | |
| "epoch": 0.272, | |
| "grad_norm": 1.501541018486023, | |
| "kl": 0.273681640625, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.0986, | |
| "reward": 0.32719678059220314, | |
| "reward_std": 1.0173063725233078, | |
| "rewards/cosine_scaled_reward": 0.05737040005624294, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 2189.7708587646484, | |
| "epoch": 0.27314285714285713, | |
| "grad_norm": 0.20339642465114594, | |
| "kl": 0.24578857421875, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.0259, | |
| "reward": 0.3876400392036885, | |
| "reward_std": 0.6854967623949051, | |
| "rewards/cosine_scaled_reward": 0.07331139780580997, | |
| "rewards/format_reward": 0.6458333395421505, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 2928.354232788086, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 0.46207401156425476, | |
| "kl": 0.510498046875, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.0666, | |
| "reward": -0.031538160517811775, | |
| "reward_std": 0.6143174581229687, | |
| "rewards/cosine_scaled_reward": -0.12371946685016155, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 3160.937545776367, | |
| "epoch": 0.2754285714285714, | |
| "grad_norm": 0.4524831175804138, | |
| "kl": 0.5111083984375, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.0504, | |
| "reward": -0.24879638850688934, | |
| "reward_std": 0.6495300643146038, | |
| "rewards/cosine_scaled_reward": -0.22295443713665009, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 2566.5000610351562, | |
| "epoch": 0.2765714285714286, | |
| "grad_norm": 0.578352153301239, | |
| "kl": 0.4998779296875, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.0285, | |
| "reward": 0.15099805174395442, | |
| "reward_std": 0.515793077647686, | |
| "rewards/cosine_scaled_reward": -0.11855055205523968, | |
| "rewards/format_reward": 0.7291666809469461, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 2909.3958587646484, | |
| "epoch": 0.2777142857142857, | |
| "grad_norm": 0.8137231469154358, | |
| "kl": 0.51025390625, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.0269, | |
| "reward": 0.334348788484931, | |
| "reward_std": 0.7821071408689022, | |
| "rewards/cosine_scaled_reward": 0.07382349669933319, | |
| "rewards/format_reward": 0.5625000037252903, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 2917.541748046875, | |
| "epoch": 0.27885714285714286, | |
| "grad_norm": 0.6295875906944275, | |
| "kl": 0.50042724609375, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.0786, | |
| "reward": 0.2729534022510052, | |
| "reward_std": 0.830750398337841, | |
| "rewards/cosine_scaled_reward": 0.08813249412924051, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 3121.8958740234375, | |
| "epoch": 0.28, | |
| "grad_norm": 1.1297352313995361, | |
| "kl": 0.5673828125, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.12, | |
| "reward": 0.08920413255691528, | |
| "reward_std": 1.0364465415477753, | |
| "rewards/cosine_scaled_reward": -0.05938760610297322, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 3070.0833740234375, | |
| "epoch": 0.28114285714285714, | |
| "grad_norm": 0.941576361656189, | |
| "kl": 0.61669921875, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.1055, | |
| "reward": 0.07732813712209463, | |
| "reward_std": 0.9496094807982445, | |
| "rewards/cosine_scaled_reward": -0.05677332216873765, | |
| "rewards/format_reward": 0.45833334513008595, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 3249.375045776367, | |
| "epoch": 0.2822857142857143, | |
| "grad_norm": 0.7210645079612732, | |
| "kl": 0.67041015625, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": 0.0552, | |
| "reward": -0.3232395015656948, | |
| "reward_std": 0.6331962943077087, | |
| "rewards/cosine_scaled_reward": -0.21088325325399637, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 2833.7083892822266, | |
| "epoch": 0.2834285714285714, | |
| "grad_norm": 0.5416091680526733, | |
| "kl": 0.570068359375, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.0338, | |
| "reward": 0.2699696607887745, | |
| "reward_std": 0.769625548273325, | |
| "rewards/cosine_scaled_reward": 0.08322756737470627, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 2345.041702270508, | |
| "epoch": 0.2845714285714286, | |
| "grad_norm": 0.4412481486797333, | |
| "kl": 0.44451904296875, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.0447, | |
| "reward": 0.4360041692852974, | |
| "reward_std": 0.8868755213916302, | |
| "rewards/cosine_scaled_reward": 0.08689466118812561, | |
| "rewards/format_reward": 0.6666666772216558, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 2516.8333587646484, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.5790634155273438, | |
| "kl": 0.5372314453125, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0766, | |
| "reward": 0.1508978575002402, | |
| "reward_std": 0.7845667004585266, | |
| "rewards/cosine_scaled_reward": -0.10766792530193925, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 2367.6875534057617, | |
| "epoch": 0.28685714285714287, | |
| "grad_norm": 0.465372771024704, | |
| "kl": 0.53857421875, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.045, | |
| "reward": 0.2600698294118047, | |
| "reward_std": 0.7807004749774933, | |
| "rewards/cosine_scaled_reward": -0.08423476293683052, | |
| "rewards/format_reward": 0.7708333395421505, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 2789.3750915527344, | |
| "epoch": 0.288, | |
| "grad_norm": 0.5998123288154602, | |
| "kl": 0.5885009765625, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.0437, | |
| "reward": 0.0935278192628175, | |
| "reward_std": 0.7258482351899147, | |
| "rewards/cosine_scaled_reward": -0.09951683203689754, | |
| "rewards/format_reward": 0.583333345130086, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 2500.1875915527344, | |
| "epoch": 0.28914285714285715, | |
| "grad_norm": 0.6858162879943848, | |
| "kl": 0.5001220703125, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": 0.0555, | |
| "reward": 0.2763403048738837, | |
| "reward_std": 0.9541792422533035, | |
| "rewards/cosine_scaled_reward": -0.020189424976706505, | |
| "rewards/format_reward": 0.645833345130086, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 3007.7500610351562, | |
| "epoch": 0.29028571428571426, | |
| "grad_norm": 0.7101835012435913, | |
| "kl": 0.715576171875, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.0531, | |
| "reward": 0.011770043522119522, | |
| "reward_std": 0.6751254117116332, | |
| "rewards/cosine_scaled_reward": -0.06386499013751745, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 2951.0625534057617, | |
| "epoch": 0.2914285714285714, | |
| "grad_norm": 0.8391396403312683, | |
| "kl": 0.64337158203125, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0387, | |
| "reward": -0.2175804078578949, | |
| "reward_std": 0.633871290832758, | |
| "rewards/cosine_scaled_reward": -0.2334041576832533, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 2761.041702270508, | |
| "epoch": 0.2925714285714286, | |
| "grad_norm": 1.0026954412460327, | |
| "kl": 0.5166015625, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.0119, | |
| "reward": 0.14736154675483704, | |
| "reward_std": 0.7076679766178131, | |
| "rewards/cosine_scaled_reward": -0.052284312434494495, | |
| "rewards/format_reward": 0.5625000093132257, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 3124.2709350585938, | |
| "epoch": 0.2937142857142857, | |
| "grad_norm": 0.9888486266136169, | |
| "kl": 0.50518798828125, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.0677, | |
| "reward": 0.3817547345533967, | |
| "reward_std": 0.9725684300065041, | |
| "rewards/cosine_scaled_reward": 0.10540459351614118, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 3022.979217529297, | |
| "epoch": 0.2948571428571429, | |
| "grad_norm": 1.060575246810913, | |
| "kl": 0.49462890625, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.0775, | |
| "reward": 0.11859412118792534, | |
| "reward_std": 0.8907319009304047, | |
| "rewards/cosine_scaled_reward": -0.01629660092294216, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 2550.833366394043, | |
| "epoch": 0.296, | |
| "grad_norm": 0.4294055998325348, | |
| "kl": 0.43841552734375, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.052, | |
| "reward": 0.30166149651631713, | |
| "reward_std": 0.831500705331564, | |
| "rewards/cosine_scaled_reward": 0.02446294855326414, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 2399.1250610351562, | |
| "epoch": 0.29714285714285715, | |
| "grad_norm": 0.3650396168231964, | |
| "kl": 0.36968994140625, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0443, | |
| "reward": 0.403296634554863, | |
| "reward_std": 0.926733735948801, | |
| "rewards/cosine_scaled_reward": 0.07911694049835205, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 2862.2708892822266, | |
| "epoch": 0.29828571428571427, | |
| "grad_norm": 1.088972568511963, | |
| "kl": 0.4329833984375, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.068, | |
| "reward": -0.11121963523328304, | |
| "reward_std": 0.6940420866012573, | |
| "rewards/cosine_scaled_reward": -0.19372293539345264, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 2654.479217529297, | |
| "epoch": 0.29942857142857143, | |
| "grad_norm": 0.9074969291687012, | |
| "kl": 0.4139404296875, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.0371, | |
| "reward": 0.052452532574534416, | |
| "reward_std": 0.6277891993522644, | |
| "rewards/cosine_scaled_reward": -0.18083440139889717, | |
| "rewards/format_reward": 0.7083333395421505, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 2793.729202270508, | |
| "epoch": 0.30057142857142854, | |
| "grad_norm": 2.0290629863739014, | |
| "kl": 0.50604248046875, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.0191, | |
| "reward": -0.09252774063497782, | |
| "reward_std": 0.5941142216324806, | |
| "rewards/cosine_scaled_reward": -0.2058687130920589, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 2861.9584350585938, | |
| "epoch": 0.3017142857142857, | |
| "grad_norm": 0.7169579863548279, | |
| "kl": 0.4697265625, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.0771, | |
| "reward": 0.00036056432873010635, | |
| "reward_std": 0.7790909744799137, | |
| "rewards/cosine_scaled_reward": -0.16522826021537185, | |
| "rewards/format_reward": 0.5833333469927311, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 2631.666732788086, | |
| "epoch": 0.3028571428571429, | |
| "grad_norm": 0.7453656196594238, | |
| "kl": 0.45709228515625, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.0216, | |
| "reward": 0.3167956112883985, | |
| "reward_std": 0.8776490315794945, | |
| "rewards/cosine_scaled_reward": -0.05162257980555296, | |
| "rewards/format_reward": 0.7708333414047956, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 2901.4583892822266, | |
| "epoch": 0.304, | |
| "grad_norm": 1.1252886056900024, | |
| "kl": 0.4161376953125, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": -0.0038, | |
| "reward": 0.05045348312705755, | |
| "reward_std": 0.8374455273151398, | |
| "rewards/cosine_scaled_reward": -0.1557634025812149, | |
| "rewards/format_reward": 0.6250000055879354, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 3027.8750762939453, | |
| "epoch": 0.30514285714285716, | |
| "grad_norm": 0.44741290807724, | |
| "kl": 0.4798583984375, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.0489, | |
| "reward": 0.042676386423408985, | |
| "reward_std": 0.7426804676651955, | |
| "rewards/cosine_scaled_reward": -0.14299316331744194, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 2573.0208740234375, | |
| "epoch": 0.3062857142857143, | |
| "grad_norm": 0.42620187997817993, | |
| "kl": 0.3807373046875, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.0182, | |
| "reward": 0.010210057254880667, | |
| "reward_std": 0.6326607689261436, | |
| "rewards/cosine_scaled_reward": -0.11539698392152786, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 2693.4584350585938, | |
| "epoch": 0.30742857142857144, | |
| "grad_norm": 0.8198751211166382, | |
| "kl": 0.38409423828125, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.0175, | |
| "reward": 0.04538612812757492, | |
| "reward_std": 0.8305856361985207, | |
| "rewards/cosine_scaled_reward": -0.16101938486099243, | |
| "rewards/format_reward": 0.6250000093132257, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 2817.5834197998047, | |
| "epoch": 0.30857142857142855, | |
| "grad_norm": 0.6884281039237976, | |
| "kl": 0.3497314453125, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0278, | |
| "reward": 0.19405698496848345, | |
| "reward_std": 0.9544314742088318, | |
| "rewards/cosine_scaled_reward": -0.05997245345497504, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 2640.4584045410156, | |
| "epoch": 0.3097142857142857, | |
| "grad_norm": 0.48852798342704773, | |
| "kl": 0.32904052734375, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.0483, | |
| "reward": 0.3868153728544712, | |
| "reward_std": 0.8188424855470657, | |
| "rewards/cosine_scaled_reward": 0.05347882490605116, | |
| "rewards/format_reward": 0.6666666697710752, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 3057.0000915527344, | |
| "epoch": 0.31085714285714283, | |
| "grad_norm": 0.805952787399292, | |
| "kl": 0.4097900390625, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.0648, | |
| "reward": -0.030033869668841362, | |
| "reward_std": 0.788214236497879, | |
| "rewards/cosine_scaled_reward": -0.1403372660279274, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 2664.3125610351562, | |
| "epoch": 0.312, | |
| "grad_norm": 0.6616376638412476, | |
| "kl": 0.384765625, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.0021, | |
| "reward": 0.062399049289524555, | |
| "reward_std": 0.7391597256064415, | |
| "rewards/cosine_scaled_reward": -0.06876735016703606, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 2088.8541870117188, | |
| "epoch": 0.31314285714285717, | |
| "grad_norm": 1.1467257738113403, | |
| "kl": 0.261138916015625, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": -0.0011, | |
| "reward": 0.5514285732060671, | |
| "reward_std": 0.6735243201255798, | |
| "rewards/cosine_scaled_reward": 0.1613447144627571, | |
| "rewards/format_reward": 0.6875, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 2230.0625381469727, | |
| "epoch": 0.3142857142857143, | |
| "grad_norm": 0.48795756697654724, | |
| "kl": 0.2476806640625, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0153, | |
| "reward": 0.4183642081916332, | |
| "reward_std": 0.9099490307271481, | |
| "rewards/cosine_scaled_reward": 0.07764563540695235, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 2390.729248046875, | |
| "epoch": 0.31542857142857145, | |
| "grad_norm": 0.5403981804847717, | |
| "kl": 0.2706298828125, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.0278, | |
| "reward": 0.5140420459210873, | |
| "reward_std": 0.9649418145418167, | |
| "rewards/cosine_scaled_reward": 0.08755906065925956, | |
| "rewards/format_reward": 0.7500000204890966, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 2273.7917404174805, | |
| "epoch": 0.31657142857142856, | |
| "grad_norm": 0.681844174861908, | |
| "kl": 0.2330322265625, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.0456, | |
| "reward": 0.39711499866098166, | |
| "reward_std": 0.8123867064714432, | |
| "rewards/cosine_scaled_reward": 0.028049522661603987, | |
| "rewards/format_reward": 0.7291666772216558, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 2178.3333740234375, | |
| "epoch": 0.3177142857142857, | |
| "grad_norm": 0.31549885869026184, | |
| "kl": 0.264892578125, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.0333, | |
| "reward": 0.46570797031745315, | |
| "reward_std": 0.5852809324860573, | |
| "rewards/cosine_scaled_reward": 0.054337045177817345, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 2957.604248046875, | |
| "epoch": 0.31885714285714284, | |
| "grad_norm": 0.6503915786743164, | |
| "kl": 0.3946533203125, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.0216, | |
| "reward": -0.10753144230693579, | |
| "reward_std": 0.821341261267662, | |
| "rewards/cosine_scaled_reward": -0.20013360609300435, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 2075.479217529297, | |
| "epoch": 0.32, | |
| "grad_norm": 1.738971471786499, | |
| "kl": 0.265228271484375, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0882, | |
| "reward": 0.510447891894728, | |
| "reward_std": 0.914945088326931, | |
| "rewards/cosine_scaled_reward": 0.14110325649380684, | |
| "rewards/format_reward": 0.6458333488553762, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 3364.3333740234375, | |
| "epoch": 0.3211428571428571, | |
| "grad_norm": 0.8055136203765869, | |
| "kl": 0.486083984375, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.0224, | |
| "reward": -0.18871852289885283, | |
| "reward_std": 0.6349410191178322, | |
| "rewards/cosine_scaled_reward": -0.18654117919504642, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 2701.916748046875, | |
| "epoch": 0.3222857142857143, | |
| "grad_norm": 0.4943215548992157, | |
| "kl": 0.34820556640625, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.0322, | |
| "reward": 0.3567043347284198, | |
| "reward_std": 0.687431275844574, | |
| "rewards/cosine_scaled_reward": 0.010191468521952629, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 2955.5000610351562, | |
| "epoch": 0.32342857142857145, | |
| "grad_norm": 2.2376575469970703, | |
| "kl": 0.4698486328125, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.1237, | |
| "reward": 0.17788631992880255, | |
| "reward_std": 0.9274262189865112, | |
| "rewards/cosine_scaled_reward": -0.03141580242663622, | |
| "rewards/format_reward": 0.5416666828095913, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 2488.4584350585938, | |
| "epoch": 0.32457142857142857, | |
| "grad_norm": 0.9498338103294373, | |
| "kl": 0.3389892578125, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.0732, | |
| "reward": 0.36859262455254793, | |
| "reward_std": 0.7699229158461094, | |
| "rewards/cosine_scaled_reward": -0.00418412871658802, | |
| "rewards/format_reward": 0.7708333507180214, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 2094.3750762939453, | |
| "epoch": 0.32571428571428573, | |
| "grad_norm": 0.24566514790058136, | |
| "kl": 0.274169921875, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.0373, | |
| "reward": 0.21163646131753922, | |
| "reward_std": 0.6311895027756691, | |
| "rewards/cosine_scaled_reward": -0.09609892021398991, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 2660.3333740234375, | |
| "epoch": 0.32685714285714285, | |
| "grad_norm": 0.4989126920700073, | |
| "kl": 0.52783203125, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.0441, | |
| "reward": 0.1898297774605453, | |
| "reward_std": 0.8184352703392506, | |
| "rewards/cosine_scaled_reward": -0.08057594299316406, | |
| "rewards/format_reward": 0.6666666846722364, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 1924.020881652832, | |
| "epoch": 0.328, | |
| "grad_norm": 1.0765957832336426, | |
| "kl": 0.527130126953125, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": -0.0004, | |
| "reward": 0.320914643118158, | |
| "reward_std": 0.6506007239222527, | |
| "rewards/cosine_scaled_reward": -0.0552152032032609, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 2751.916748046875, | |
| "epoch": 0.3291428571428571, | |
| "grad_norm": 0.525725781917572, | |
| "kl": 0.47174072265625, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.0345, | |
| "reward": 0.28691791370511055, | |
| "reward_std": 0.925068948417902, | |
| "rewards/cosine_scaled_reward": -0.05298358201980591, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 2081.791702270508, | |
| "epoch": 0.3302857142857143, | |
| "grad_norm": 0.4456733167171478, | |
| "kl": 0.426513671875, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.0375, | |
| "reward": 0.2169907259522006, | |
| "reward_std": 0.6403507255017757, | |
| "rewards/cosine_scaled_reward": -0.062007103115320206, | |
| "rewards/format_reward": 0.6875000055879354, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 2641.5625915527344, | |
| "epoch": 0.3314285714285714, | |
| "grad_norm": 0.49116048216819763, | |
| "kl": 0.56640625, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.0674, | |
| "reward": 0.32489653676748276, | |
| "reward_std": 0.7462290413677692, | |
| "rewards/cosine_scaled_reward": -0.07674999348819256, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 2844.3959045410156, | |
| "epoch": 0.3325714285714286, | |
| "grad_norm": 0.9147380590438843, | |
| "kl": 0.5509033203125, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.0864, | |
| "reward": 0.2941320105455816, | |
| "reward_std": 0.8829877898097038, | |
| "rewards/cosine_scaled_reward": -0.09763280488550663, | |
| "rewards/format_reward": 0.8333333544433117, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 2980.3126220703125, | |
| "epoch": 0.33371428571428574, | |
| "grad_norm": 1.0147628784179688, | |
| "kl": 0.628662109375, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.0385, | |
| "reward": 0.16018317895941436, | |
| "reward_std": 0.6120474711060524, | |
| "rewards/cosine_scaled_reward": -0.09870209451764822, | |
| "rewards/format_reward": 0.6875000111758709, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 2222.3125610351562, | |
| "epoch": 0.33485714285714285, | |
| "grad_norm": 1.3469599485397339, | |
| "kl": 0.42333984375, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": -0.0092, | |
| "reward": 0.35159002151340246, | |
| "reward_std": 0.6072813756763935, | |
| "rewards/cosine_scaled_reward": -0.03080323152244091, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 3145.1041870117188, | |
| "epoch": 0.336, | |
| "grad_norm": 1.195894718170166, | |
| "kl": 0.774658203125, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.0366, | |
| "reward": -0.011713245883584023, | |
| "reward_std": 0.686446376144886, | |
| "rewards/cosine_scaled_reward": -0.1353329624980688, | |
| "rewards/format_reward": 0.5208333414047956, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 2874.3126068115234, | |
| "epoch": 0.33714285714285713, | |
| "grad_norm": 0.6826883554458618, | |
| "kl": 0.62451171875, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0648, | |
| "reward": 0.4660371467471123, | |
| "reward_std": 0.8896083161234856, | |
| "rewards/cosine_scaled_reward": 0.10532690212130547, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 3109.916748046875, | |
| "epoch": 0.3382857142857143, | |
| "grad_norm": 0.6580361723899841, | |
| "kl": 0.54638671875, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.0593, | |
| "reward": 0.1299741494731279, | |
| "reward_std": 0.7981628403067589, | |
| "rewards/cosine_scaled_reward": -0.1501350817270577, | |
| "rewards/format_reward": 0.7291666977107525, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 3412.6875915527344, | |
| "epoch": 0.3394285714285714, | |
| "grad_norm": 0.5886114835739136, | |
| "kl": 0.65771484375, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.0604, | |
| "reward": -0.13705510459840298, | |
| "reward_std": 0.7897306978702545, | |
| "rewards/cosine_scaled_reward": -0.1760294260457158, | |
| "rewards/format_reward": 0.41666666977107525, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 2516.0000610351562, | |
| "epoch": 0.3405714285714286, | |
| "grad_norm": 0.9390971660614014, | |
| "kl": 0.326019287109375, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": -0.005, | |
| "reward": 0.2075387438526377, | |
| "reward_std": 0.678107738494873, | |
| "rewards/cosine_scaled_reward": -0.06827879883348942, | |
| "rewards/format_reward": 0.687500013038516, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 2798.750045776367, | |
| "epoch": 0.3417142857142857, | |
| "grad_norm": 0.32884588837623596, | |
| "kl": 0.3729248046875, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.029, | |
| "reward": 0.3677529713604599, | |
| "reward_std": 0.581198662519455, | |
| "rewards/cosine_scaled_reward": 0.07487833127379417, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 3013.9584350585938, | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 0.45819181203842163, | |
| "kl": 0.378662109375, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.0371, | |
| "reward": 0.11145258648321033, | |
| "reward_std": 0.8130291737616062, | |
| "rewards/cosine_scaled_reward": -0.12973832013085485, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 2654.791717529297, | |
| "epoch": 0.344, | |
| "grad_norm": 0.4394110143184662, | |
| "kl": 0.3597412109375, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.0512, | |
| "reward": 0.15728470450267196, | |
| "reward_std": 0.8084782175719738, | |
| "rewards/cosine_scaled_reward": -0.07348441705107689, | |
| "rewards/format_reward": 0.6041666753590107, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 2718.750030517578, | |
| "epoch": 0.34514285714285714, | |
| "grad_norm": 1.5091288089752197, | |
| "kl": 0.374420166015625, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": -0.0067, | |
| "reward": -0.010899038054049015, | |
| "reward_std": 0.6495076231658459, | |
| "rewards/cosine_scaled_reward": -0.1349128014408052, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 2785.0833740234375, | |
| "epoch": 0.3462857142857143, | |
| "grad_norm": 0.6161960363388062, | |
| "kl": 0.3387451171875, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": 0.0533, | |
| "reward": 0.19499525055289268, | |
| "reward_std": 0.9087162502110004, | |
| "rewards/cosine_scaled_reward": -0.10367321688681841, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 2953.541778564453, | |
| "epoch": 0.3474285714285714, | |
| "grad_norm": 0.4118628203868866, | |
| "kl": 0.3302001953125, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.05, | |
| "reward": 0.17398711014539003, | |
| "reward_std": 0.7196006700396538, | |
| "rewards/cosine_scaled_reward": -0.13903226237744093, | |
| "rewards/format_reward": 0.7708333488553762, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 3122.791748046875, | |
| "epoch": 0.3485714285714286, | |
| "grad_norm": 0.9769808650016785, | |
| "kl": 0.315673828125, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0601, | |
| "reward": 0.1985210245475173, | |
| "reward_std": 0.7755458503961563, | |
| "rewards/cosine_scaled_reward": -0.06837073154747486, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 2781.9584350585938, | |
| "epoch": 0.3497142857142857, | |
| "grad_norm": 0.8336344957351685, | |
| "kl": 0.295135498046875, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": -0.0088, | |
| "reward": 0.34240800654515624, | |
| "reward_std": 0.7838171310722828, | |
| "rewards/cosine_scaled_reward": 0.005472499451570911, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 2925.625045776367, | |
| "epoch": 0.35085714285714287, | |
| "grad_norm": 1.742169737815857, | |
| "kl": 0.31103515625, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": 0.0824, | |
| "reward": 0.24728509783744812, | |
| "reward_std": 1.0561788342893124, | |
| "rewards/cosine_scaled_reward": -0.03675899375230074, | |
| "rewards/format_reward": 0.625000013038516, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 3307.916748046875, | |
| "epoch": 0.352, | |
| "grad_norm": 0.5225728154182434, | |
| "kl": 0.390625, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.0089, | |
| "reward": 0.06603662599809468, | |
| "reward_std": 0.6128693893551826, | |
| "rewards/cosine_scaled_reward": -0.17313291411846876, | |
| "rewards/format_reward": 0.7083333414047956, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 2979.479217529297, | |
| "epoch": 0.35314285714285715, | |
| "grad_norm": 0.4211791157722473, | |
| "kl": 0.259429931640625, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.0147, | |
| "reward": 0.2866139570251107, | |
| "reward_std": 0.7773850671947002, | |
| "rewards/cosine_scaled_reward": -0.009212411940097809, | |
| "rewards/format_reward": 0.6666666753590107, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 2679.1667404174805, | |
| "epoch": 0.35428571428571426, | |
| "grad_norm": 0.5345349907875061, | |
| "kl": 0.33343505859375, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.0432, | |
| "reward": 0.24587753415107727, | |
| "reward_std": 0.8905804492533207, | |
| "rewards/cosine_scaled_reward": -0.016845128498971462, | |
| "rewards/format_reward": 0.6041666809469461, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 2646.2500610351562, | |
| "epoch": 0.3554285714285714, | |
| "grad_norm": 0.32929864525794983, | |
| "kl": 0.261383056640625, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.0258, | |
| "reward": 0.1976035200059414, | |
| "reward_std": 0.8284550718963146, | |
| "rewards/cosine_scaled_reward": -0.04515882022678852, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 2335.729217529297, | |
| "epoch": 0.3565714285714286, | |
| "grad_norm": 0.296840637922287, | |
| "kl": 0.254730224609375, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.0257, | |
| "reward": 0.5823600944131613, | |
| "reward_std": 0.691787600517273, | |
| "rewards/cosine_scaled_reward": 0.1733840461820364, | |
| "rewards/format_reward": 0.708333333954215, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 2918.68758392334, | |
| "epoch": 0.3577142857142857, | |
| "grad_norm": 0.8261905312538147, | |
| "kl": 0.3216552734375, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.0615, | |
| "reward": 0.32681620866060257, | |
| "reward_std": 0.8109251782298088, | |
| "rewards/cosine_scaled_reward": 0.06859785690903664, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 2584.812545776367, | |
| "epoch": 0.3588571428571429, | |
| "grad_norm": 0.28141534328460693, | |
| "kl": 0.24951171875, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": -0.0017, | |
| "reward": 0.5454013433773071, | |
| "reward_std": 0.7297746650874615, | |
| "rewards/cosine_scaled_reward": 0.13757295534014702, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 3015.229232788086, | |
| "epoch": 0.36, | |
| "grad_norm": 0.40665552020072937, | |
| "kl": 0.357452392578125, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.0237, | |
| "reward": 0.010891908779740334, | |
| "reward_std": 0.7054996266961098, | |
| "rewards/cosine_scaled_reward": -0.11506736988667399, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 3297.6875610351562, | |
| "epoch": 0.36114285714285715, | |
| "grad_norm": 0.38543111085891724, | |
| "kl": 0.36572265625, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.0324, | |
| "reward": -0.13673021260183305, | |
| "reward_std": 0.5753876939415932, | |
| "rewards/cosine_scaled_reward": -0.2870428040623665, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 3066.8958892822266, | |
| "epoch": 0.36228571428571427, | |
| "grad_norm": 0.5159828066825867, | |
| "kl": 0.4085693359375, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.0244, | |
| "reward": 0.13786310516297817, | |
| "reward_std": 0.6435285620391369, | |
| "rewards/cosine_scaled_reward": -0.11537853349000216, | |
| "rewards/format_reward": 0.6875000279396772, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 2283.9375610351562, | |
| "epoch": 0.36342857142857143, | |
| "grad_norm": 0.4553980827331543, | |
| "kl": 0.24249267578125, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": -0.0024, | |
| "reward": 0.41944058798253536, | |
| "reward_std": 0.9410093426704407, | |
| "rewards/cosine_scaled_reward": -0.0460501410998404, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 3003.750030517578, | |
| "epoch": 0.36457142857142855, | |
| "grad_norm": 0.3188877999782562, | |
| "kl": 0.35693359375, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.0374, | |
| "reward": -0.02601405733730644, | |
| "reward_std": 0.7571947351098061, | |
| "rewards/cosine_scaled_reward": -0.16905247140675783, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 2351.791748046875, | |
| "epoch": 0.3657142857142857, | |
| "grad_norm": 0.28264015913009644, | |
| "kl": 0.226318359375, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0313, | |
| "reward": 0.41428154334425926, | |
| "reward_std": 0.6797180473804474, | |
| "rewards/cosine_scaled_reward": -0.014672968536615372, | |
| "rewards/format_reward": 0.8541666753590107, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 2420.500099182129, | |
| "epoch": 0.3668571428571429, | |
| "grad_norm": 0.3555741012096405, | |
| "kl": 0.2991943359375, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.0064, | |
| "reward": 0.3246429590508342, | |
| "reward_std": 0.84641108289361, | |
| "rewards/cosine_scaled_reward": -0.021552213234826922, | |
| "rewards/format_reward": 0.7291666902601719, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 2859.395950317383, | |
| "epoch": 0.368, | |
| "grad_norm": 0.3224516808986664, | |
| "kl": 0.27166748046875, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.013, | |
| "reward": 0.41165721049765125, | |
| "reward_std": 0.8220065757632256, | |
| "rewards/cosine_scaled_reward": 0.0005064834840595722, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 2920.3125610351562, | |
| "epoch": 0.36914285714285716, | |
| "grad_norm": 0.7611133456230164, | |
| "kl": 0.2857666015625, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.0486, | |
| "reward": 0.29905556747689843, | |
| "reward_std": 0.7838596850633621, | |
| "rewards/cosine_scaled_reward": -0.0440771235153079, | |
| "rewards/format_reward": 0.7500000223517418, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 3198.3334045410156, | |
| "epoch": 0.3702857142857143, | |
| "grad_norm": 0.5829113125801086, | |
| "kl": 0.34033203125, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.035, | |
| "reward": 0.06515605933964252, | |
| "reward_std": 0.6232591606676579, | |
| "rewards/cosine_scaled_reward": -0.12263932824134827, | |
| "rewards/format_reward": 0.6041666734963655, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 2887.104202270508, | |
| "epoch": 0.37142857142857144, | |
| "grad_norm": 0.3115164339542389, | |
| "kl": 0.286163330078125, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0308, | |
| "reward": 0.04865055438131094, | |
| "reward_std": 0.769499409943819, | |
| "rewards/cosine_scaled_reward": -0.14275423251092434, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 2667.5834045410156, | |
| "epoch": 0.37257142857142855, | |
| "grad_norm": 0.2706601023674011, | |
| "kl": 0.24713134765625, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.0352, | |
| "reward": 0.3725670697167516, | |
| "reward_std": 0.7936568856239319, | |
| "rewards/cosine_scaled_reward": -0.0039613001281395555, | |
| "rewards/format_reward": 0.7708333395421505, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 2546.291702270508, | |
| "epoch": 0.3737142857142857, | |
| "grad_norm": 0.2914826273918152, | |
| "kl": 0.26849365234375, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.0326, | |
| "reward": 0.36802724679000676, | |
| "reward_std": 0.7584006376564503, | |
| "rewards/cosine_scaled_reward": 0.046540172887034714, | |
| "rewards/format_reward": 0.6666666772216558, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 3366.104217529297, | |
| "epoch": 0.37485714285714283, | |
| "grad_norm": 0.42033851146698, | |
| "kl": 0.346923828125, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.04, | |
| "reward": -0.22989960946142673, | |
| "reward_std": 0.757425207644701, | |
| "rewards/cosine_scaled_reward": -0.22673101257532835, | |
| "rewards/format_reward": 0.3958333469927311, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 2347.7500534057617, | |
| "epoch": 0.376, | |
| "grad_norm": 0.8453741073608398, | |
| "kl": 0.227630615234375, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.0358, | |
| "reward": 0.4706056764116511, | |
| "reward_std": 0.8056343197822571, | |
| "rewards/cosine_scaled_reward": 0.021218265290372074, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 2407.020866394043, | |
| "epoch": 0.37714285714285717, | |
| "grad_norm": 0.7580931782722473, | |
| "kl": 0.237945556640625, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": -0.0101, | |
| "reward": 0.12094018794596195, | |
| "reward_std": 0.6862670034170151, | |
| "rewards/cosine_scaled_reward": -0.19688204117119312, | |
| "rewards/format_reward": 0.8125000018626451, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 2858.2083740234375, | |
| "epoch": 0.3782857142857143, | |
| "grad_norm": 0.528272271156311, | |
| "kl": 0.30035400390625, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.0324, | |
| "reward": -0.01362695125862956, | |
| "reward_std": 0.7233411334455013, | |
| "rewards/cosine_scaled_reward": -0.1397905834019184, | |
| "rewards/format_reward": 0.5208333414047956, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 2584.729217529297, | |
| "epoch": 0.37942857142857145, | |
| "grad_norm": 0.6263169050216675, | |
| "kl": 0.2864990234375, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.0382, | |
| "reward": 0.33939856104552746, | |
| "reward_std": 0.8283688835799694, | |
| "rewards/cosine_scaled_reward": 0.012080416432581842, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 2398.562545776367, | |
| "epoch": 0.38057142857142856, | |
| "grad_norm": 0.2747509777545929, | |
| "kl": 0.20452880859375, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.0178, | |
| "reward": 0.3634027219377458, | |
| "reward_std": 0.7900962755084038, | |
| "rewards/cosine_scaled_reward": -0.07603580364957452, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 3390.7500915527344, | |
| "epoch": 0.38171428571428573, | |
| "grad_norm": 0.6542767882347107, | |
| "kl": 0.46240234375, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.0728, | |
| "reward": -0.052195049822330475, | |
| "reward_std": 0.894780658185482, | |
| "rewards/cosine_scaled_reward": -0.17814880423247814, | |
| "rewards/format_reward": 0.5208333414047956, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 2516.604217529297, | |
| "epoch": 0.38285714285714284, | |
| "grad_norm": 0.2772899866104126, | |
| "kl": 0.257080078125, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.0318, | |
| "reward": 0.1776880531979259, | |
| "reward_std": 0.7250074371695518, | |
| "rewards/cosine_scaled_reward": -0.06235098314937204, | |
| "rewards/format_reward": 0.6250000093132257, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 3047.0000915527344, | |
| "epoch": 0.384, | |
| "grad_norm": 1.0677460432052612, | |
| "kl": 0.34222412109375, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.0384, | |
| "reward": 0.3056840244680643, | |
| "reward_std": 0.8023335263133049, | |
| "rewards/cosine_scaled_reward": -0.030411685816943645, | |
| "rewards/format_reward": 0.729166692122817, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 2612.916748046875, | |
| "epoch": 0.3851428571428571, | |
| "grad_norm": 1.6512010097503662, | |
| "kl": 0.316650390625, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.0747, | |
| "reward": 0.456050219014287, | |
| "reward_std": 1.0556022450327873, | |
| "rewards/cosine_scaled_reward": 0.01445878017693758, | |
| "rewards/format_reward": 0.8125000223517418, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 2305.4791870117188, | |
| "epoch": 0.3862857142857143, | |
| "grad_norm": 0.9437749981880188, | |
| "kl": 0.30230712890625, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": -0.0135, | |
| "reward": 0.5471778312930837, | |
| "reward_std": 0.794790405780077, | |
| "rewards/cosine_scaled_reward": 0.12062014266848564, | |
| "rewards/format_reward": 0.75, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 3010.1250534057617, | |
| "epoch": 0.38742857142857146, | |
| "grad_norm": 0.5747052431106567, | |
| "kl": 0.360382080078125, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.0191, | |
| "reward": -0.03356679296121001, | |
| "reward_std": 0.7060465253889561, | |
| "rewards/cosine_scaled_reward": -0.17198694869875908, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 2871.2709045410156, | |
| "epoch": 0.38857142857142857, | |
| "grad_norm": 0.5294919610023499, | |
| "kl": 0.3682861328125, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0492, | |
| "reward": 0.29094321094453335, | |
| "reward_std": 0.7570115067064762, | |
| "rewards/cosine_scaled_reward": -0.11235274095088243, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 2639.062545776367, | |
| "epoch": 0.38971428571428574, | |
| "grad_norm": 0.45286688208580017, | |
| "kl": 0.29937744140625, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.0171, | |
| "reward": 0.6249844692647457, | |
| "reward_std": 0.815760787576437, | |
| "rewards/cosine_scaled_reward": 0.17363815288990736, | |
| "rewards/format_reward": 0.7500000093132257, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 2894.0208587646484, | |
| "epoch": 0.39085714285714285, | |
| "grad_norm": 0.3460341691970825, | |
| "kl": 0.4681396484375, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.0579, | |
| "reward": 0.08103763521648943, | |
| "reward_std": 0.7632105089724064, | |
| "rewards/cosine_scaled_reward": -0.1950855739414692, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 2808.6250610351562, | |
| "epoch": 0.392, | |
| "grad_norm": 1.3520926237106323, | |
| "kl": 0.3250732421875, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.0791, | |
| "reward": 0.48628374096006155, | |
| "reward_std": 0.8851627111434937, | |
| "rewards/cosine_scaled_reward": 0.07822982332436368, | |
| "rewards/format_reward": 0.7500000167638063, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 2434.958366394043, | |
| "epoch": 0.3931428571428571, | |
| "grad_norm": 0.38433942198753357, | |
| "kl": 0.2788543701171875, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.0233, | |
| "reward": 0.6341738551855087, | |
| "reward_std": 0.7020405307412148, | |
| "rewards/cosine_scaled_reward": 0.1561431773006916, | |
| "rewards/format_reward": 0.8125000037252903, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 2946.3958892822266, | |
| "epoch": 0.3942857142857143, | |
| "grad_norm": 0.9588437080383301, | |
| "kl": 0.4776611328125, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.0089, | |
| "reward": 0.4796811621636152, | |
| "reward_std": 0.8463685475289822, | |
| "rewards/cosine_scaled_reward": 0.07671219296753407, | |
| "rewards/format_reward": 0.7500000037252903, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 2810.916717529297, | |
| "epoch": 0.3954285714285714, | |
| "grad_norm": 0.7113232612609863, | |
| "kl": 0.45458984375, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.0365, | |
| "reward": 0.29816207382827997, | |
| "reward_std": 0.8527140244841576, | |
| "rewards/cosine_scaled_reward": -0.028450995916500688, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 3077.9584045410156, | |
| "epoch": 0.3965714285714286, | |
| "grad_norm": 1.0087529420852661, | |
| "kl": 0.5037841796875, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.027, | |
| "reward": -0.07710191514343023, | |
| "reward_std": 0.6414642743766308, | |
| "rewards/cosine_scaled_reward": -0.2412154171615839, | |
| "rewards/format_reward": 0.6458333469927311, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 2701.1042709350586, | |
| "epoch": 0.3977142857142857, | |
| "grad_norm": 0.7987737059593201, | |
| "kl": 0.3995208740234375, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.0226, | |
| "reward": 0.299939407967031, | |
| "reward_std": 0.602581400424242, | |
| "rewards/cosine_scaled_reward": -0.08414606470614672, | |
| "rewards/format_reward": 0.8541666772216558, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 2987.229278564453, | |
| "epoch": 0.39885714285714285, | |
| "grad_norm": 1.26498544216156, | |
| "kl": 0.451416015625, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.0201, | |
| "reward": 0.11506683845072985, | |
| "reward_std": 0.7807445377111435, | |
| "rewards/cosine_scaled_reward": -0.13993432931602, | |
| "rewards/format_reward": 0.687500013038516, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 2590.395950317383, | |
| "epoch": 0.4, | |
| "grad_norm": 0.5970591902732849, | |
| "kl": 0.4644775390625, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.0338, | |
| "reward": 0.18042142933700234, | |
| "reward_std": 0.8871416859328747, | |
| "rewards/cosine_scaled_reward": -0.14772336441092193, | |
| "rewards/format_reward": 0.7708333544433117, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 2721.6250762939453, | |
| "epoch": 0.40114285714285713, | |
| "grad_norm": 0.43472933769226074, | |
| "kl": 0.25958251953125, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.0395, | |
| "reward": 0.42602336849085987, | |
| "reward_std": 0.8064659312367439, | |
| "rewards/cosine_scaled_reward": 0.042032238095998764, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 2447.8334045410156, | |
| "epoch": 0.4022857142857143, | |
| "grad_norm": 1.039471983909607, | |
| "kl": 0.2555999755859375, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.0413, | |
| "reward": 0.24063719739206135, | |
| "reward_std": 0.8650816380977631, | |
| "rewards/cosine_scaled_reward": -0.0003559635952115059, | |
| "rewards/format_reward": 0.5625000093132257, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 2217.14591217041, | |
| "epoch": 0.4034285714285714, | |
| "grad_norm": 0.41167160868644714, | |
| "kl": 0.203216552734375, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.0232, | |
| "reward": 0.5322136869654059, | |
| "reward_std": 0.666268203407526, | |
| "rewards/cosine_scaled_reward": 0.04740488715469837, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 2258.625045776367, | |
| "epoch": 0.4045714285714286, | |
| "grad_norm": 0.22829324007034302, | |
| "kl": 0.180816650390625, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.0048, | |
| "reward": 0.5726254731416702, | |
| "reward_std": 0.7559525668621063, | |
| "rewards/cosine_scaled_reward": 0.12823878531344235, | |
| "rewards/format_reward": 0.7708333414047956, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 2250.791778564453, | |
| "epoch": 0.4057142857142857, | |
| "grad_norm": 0.602950394153595, | |
| "kl": 0.2418212890625, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0347, | |
| "reward": 0.6632253341376781, | |
| "reward_std": 0.8946371823549271, | |
| "rewards/cosine_scaled_reward": 0.15655125584453344, | |
| "rewards/format_reward": 0.8333333544433117, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 2813.9583892822266, | |
| "epoch": 0.40685714285714286, | |
| "grad_norm": 0.49410441517829895, | |
| "kl": 0.27777099609375, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.0194, | |
| "reward": 0.21525360643863678, | |
| "reward_std": 0.9108888022601604, | |
| "rewards/cosine_scaled_reward": -0.058567093685269356, | |
| "rewards/format_reward": 0.6458333469927311, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 3031.3751220703125, | |
| "epoch": 0.408, | |
| "grad_norm": 0.46776533126831055, | |
| "kl": 0.290283203125, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.0666, | |
| "reward": 0.11198169272392988, | |
| "reward_std": 0.7659357041120529, | |
| "rewards/cosine_scaled_reward": -0.17274215212091804, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 2567.7709197998047, | |
| "epoch": 0.40914285714285714, | |
| "grad_norm": 0.9789031744003296, | |
| "kl": 0.2848052978515625, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.082, | |
| "reward": 0.4794031195342541, | |
| "reward_std": 0.9315466657280922, | |
| "rewards/cosine_scaled_reward": 0.028890431160107255, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 2519.666702270508, | |
| "epoch": 0.4102857142857143, | |
| "grad_norm": 0.3785220980644226, | |
| "kl": 0.329376220703125, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.0228, | |
| "reward": 0.32482871878892183, | |
| "reward_std": 0.804768543690443, | |
| "rewards/cosine_scaled_reward": -0.028755411505699158, | |
| "rewards/format_reward": 0.7500000186264515, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 2652.604263305664, | |
| "epoch": 0.4114285714285714, | |
| "grad_norm": 0.5460115075111389, | |
| "kl": 0.22186279296875, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0358, | |
| "reward": 0.5476951543241739, | |
| "reward_std": 0.8413685485720634, | |
| "rewards/cosine_scaled_reward": 0.047556648729369044, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 2775.6251220703125, | |
| "epoch": 0.4125714285714286, | |
| "grad_norm": 0.6770034432411194, | |
| "kl": 0.27569580078125, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.0547, | |
| "reward": 0.21503734902944416, | |
| "reward_std": 0.7295170053839684, | |
| "rewards/cosine_scaled_reward": -0.1001668400131166, | |
| "rewards/format_reward": 0.7500000186264515, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 1685.1875305175781, | |
| "epoch": 0.4137142857142857, | |
| "grad_norm": 0.2777215838432312, | |
| "kl": 0.158172607421875, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.0005, | |
| "reward": 0.4802695903927088, | |
| "reward_std": 0.6514372602105141, | |
| "rewards/cosine_scaled_reward": 0.03301592729985714, | |
| "rewards/format_reward": 0.8541666679084301, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 1911.145896911621, | |
| "epoch": 0.41485714285714287, | |
| "grad_norm": 0.5700949430465698, | |
| "kl": 0.2712554931640625, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.0486, | |
| "reward": 0.3370086522772908, | |
| "reward_std": 0.7742916233837605, | |
| "rewards/cosine_scaled_reward": 0.0005665780045092106, | |
| "rewards/format_reward": 0.7083333395421505, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 2883.229248046875, | |
| "epoch": 0.416, | |
| "grad_norm": 0.5325201749801636, | |
| "kl": 0.36102294921875, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.0226, | |
| "reward": -0.025897093466483057, | |
| "reward_std": 0.6142625138163567, | |
| "rewards/cosine_scaled_reward": -0.23417398636229336, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 2721.312545776367, | |
| "epoch": 0.41714285714285715, | |
| "grad_norm": 0.7317801713943481, | |
| "kl": 0.36126708984375, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.0568, | |
| "reward": 0.3746628388762474, | |
| "reward_std": 0.8317583128809929, | |
| "rewards/cosine_scaled_reward": 0.022838744800537825, | |
| "rewards/format_reward": 0.7083333395421505, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 2259.645854949951, | |
| "epoch": 0.41828571428571426, | |
| "grad_norm": 0.5810126662254333, | |
| "kl": 0.244354248046875, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.0253, | |
| "reward": 0.4160766340792179, | |
| "reward_std": 0.7030897587537766, | |
| "rewards/cosine_scaled_reward": 0.050911818630993366, | |
| "rewards/format_reward": 0.7291666828095913, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 3181.5209350585938, | |
| "epoch": 0.41942857142857143, | |
| "grad_norm": 0.4794647991657257, | |
| "kl": 0.4725341796875, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.0514, | |
| "reward": 0.09267363836988807, | |
| "reward_std": 0.7625463083386421, | |
| "rewards/cosine_scaled_reward": -0.12190989265218377, | |
| "rewards/format_reward": 0.6250000204890966, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 2849.666732788086, | |
| "epoch": 0.4205714285714286, | |
| "grad_norm": 1.0089714527130127, | |
| "kl": 0.32720947265625, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": -0.0027, | |
| "reward": 0.1751155611127615, | |
| "reward_std": 0.549890723079443, | |
| "rewards/cosine_scaled_reward": -0.07263889070600271, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 2855.750045776367, | |
| "epoch": 0.4217142857142857, | |
| "grad_norm": 0.9463785290718079, | |
| "kl": 0.40081787109375, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.0535, | |
| "reward": 0.1500562410801649, | |
| "reward_std": 0.7940640598535538, | |
| "rewards/cosine_scaled_reward": -0.10133596323430538, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 2896.3959197998047, | |
| "epoch": 0.4228571428571429, | |
| "grad_norm": 0.8330612778663635, | |
| "kl": 0.416717529296875, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0286, | |
| "reward": 0.183703294955194, | |
| "reward_std": 0.45754858292639256, | |
| "rewards/cosine_scaled_reward": -0.09295342303812504, | |
| "rewards/format_reward": 0.7291666772216558, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 1955.479263305664, | |
| "epoch": 0.424, | |
| "grad_norm": 0.3391231894493103, | |
| "kl": 0.262359619140625, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.0184, | |
| "reward": 0.6992178884102032, | |
| "reward_std": 0.6993260197341442, | |
| "rewards/cosine_scaled_reward": 0.16803688369691372, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 3005.6458740234375, | |
| "epoch": 0.42514285714285716, | |
| "grad_norm": 0.5167466402053833, | |
| "kl": 0.4355926513671875, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.0432, | |
| "reward": 0.19448892027139664, | |
| "reward_std": 0.7890233434736729, | |
| "rewards/cosine_scaled_reward": -0.021921467036008835, | |
| "rewards/format_reward": 0.5625000093132257, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 1942.3333892822266, | |
| "epoch": 0.42628571428571427, | |
| "grad_norm": 0.3325159549713135, | |
| "kl": 0.2429656982421875, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.0212, | |
| "reward": 0.3014055141247809, | |
| "reward_std": 0.8629297837615013, | |
| "rewards/cosine_scaled_reward": -0.07299621542915702, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 2335.020866394043, | |
| "epoch": 0.42742857142857144, | |
| "grad_norm": 0.8563280701637268, | |
| "kl": 0.3255615234375, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.0616, | |
| "reward": 0.2297945898026228, | |
| "reward_std": 0.9343635328114033, | |
| "rewards/cosine_scaled_reward": -0.047990256920456886, | |
| "rewards/format_reward": 0.6458333469927311, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 2628.291748046875, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 0.9893895387649536, | |
| "kl": 0.34588623046875, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.0665, | |
| "reward": 0.35648069988383213, | |
| "reward_std": 0.7849867083132267, | |
| "rewards/cosine_scaled_reward": 0.0042181313037872314, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 2365.6459045410156, | |
| "epoch": 0.4297142857142857, | |
| "grad_norm": 0.37569403648376465, | |
| "kl": 0.351654052734375, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.0279, | |
| "reward": 0.13466822169721127, | |
| "reward_std": 0.7210676558315754, | |
| "rewards/cosine_scaled_reward": -0.1441901307553053, | |
| "rewards/format_reward": 0.7291666753590107, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 3061.979278564453, | |
| "epoch": 0.4308571428571429, | |
| "grad_norm": 0.7083445191383362, | |
| "kl": 0.4854736328125, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.0347, | |
| "reward": 0.34864536579698324, | |
| "reward_std": 1.0110743790864944, | |
| "rewards/cosine_scaled_reward": 0.04069389193318784, | |
| "rewards/format_reward": 0.6250000186264515, | |
| "step": 377 | |
| }, | |
| { | |
| "completion_length": 2377.3542709350586, | |
| "epoch": 0.432, | |
| "grad_norm": 0.49458223581314087, | |
| "kl": 0.311187744140625, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.0331, | |
| "reward": 0.5616534340661019, | |
| "reward_std": 0.7573869004845619, | |
| "rewards/cosine_scaled_reward": 0.08385843969881535, | |
| "rewards/format_reward": 0.8541666939854622, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 2865.729248046875, | |
| "epoch": 0.43314285714285716, | |
| "grad_norm": 1.2933725118637085, | |
| "kl": 0.4803466796875, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.0196, | |
| "reward": 0.3152011390775442, | |
| "reward_std": 0.7045796066522598, | |
| "rewards/cosine_scaled_reward": -0.010164887178689241, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 379 | |
| }, | |
| { | |
| "completion_length": 2530.7708892822266, | |
| "epoch": 0.4342857142857143, | |
| "grad_norm": 0.6860670447349548, | |
| "kl": 0.32769775390625, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.0408, | |
| "reward": 0.2659956905990839, | |
| "reward_std": 0.727360412478447, | |
| "rewards/cosine_scaled_reward": -0.08539670892059803, | |
| "rewards/format_reward": 0.7916666753590107, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 2727.3333587646484, | |
| "epoch": 0.43542857142857144, | |
| "grad_norm": 0.8145273923873901, | |
| "kl": 0.439697265625, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.0334, | |
| "reward": 0.08977367201441666, | |
| "reward_std": 0.808791808784008, | |
| "rewards/cosine_scaled_reward": -0.1487837778404355, | |
| "rewards/format_reward": 0.6666666753590107, | |
| "step": 381 | |
| }, | |
| { | |
| "completion_length": 2468.104248046875, | |
| "epoch": 0.43657142857142855, | |
| "grad_norm": 0.42541611194610596, | |
| "kl": 0.37420654296875, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.0623, | |
| "reward": 0.18184729665517807, | |
| "reward_std": 0.8058411814272404, | |
| "rewards/cosine_scaled_reward": -0.1598250768147409, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 2856.6458892822266, | |
| "epoch": 0.4377142857142857, | |
| "grad_norm": 0.6056004762649536, | |
| "kl": 0.4031982421875, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.044, | |
| "reward": 0.46744489343836904, | |
| "reward_std": 0.9438849911093712, | |
| "rewards/cosine_scaled_reward": 0.08236491866409779, | |
| "rewards/format_reward": 0.7083333488553762, | |
| "step": 383 | |
| }, | |
| { | |
| "completion_length": 2167.7500762939453, | |
| "epoch": 0.43885714285714283, | |
| "grad_norm": 0.6828803420066833, | |
| "kl": 0.2847900390625, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.0052, | |
| "reward": 0.8705862760543823, | |
| "reward_std": 0.8881346955895424, | |
| "rewards/cosine_scaled_reward": 0.32432539528235793, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 2416.9167098999023, | |
| "epoch": 0.44, | |
| "grad_norm": 0.6516284942626953, | |
| "kl": 0.360748291015625, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0588, | |
| "reward": 0.24877717718482018, | |
| "reward_std": 0.735356081277132, | |
| "rewards/cosine_scaled_reward": -0.08876009099185467, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 2890.8959350585938, | |
| "epoch": 0.44114285714285717, | |
| "grad_norm": 0.6051266193389893, | |
| "kl": 0.4339599609375, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.0433, | |
| "reward": 0.37795007787644863, | |
| "reward_std": 0.8708282820880413, | |
| "rewards/cosine_scaled_reward": 0.02761521004140377, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 2727.416778564453, | |
| "epoch": 0.4422857142857143, | |
| "grad_norm": 0.4502924680709839, | |
| "kl": 0.392852783203125, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.0409, | |
| "reward": 0.40206454193685204, | |
| "reward_std": 0.8078175410628319, | |
| "rewards/cosine_scaled_reward": 0.03367285244166851, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 387 | |
| }, | |
| { | |
| "completion_length": 2498.291732788086, | |
| "epoch": 0.44342857142857145, | |
| "grad_norm": 0.7455759048461914, | |
| "kl": 0.421722412109375, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.0377, | |
| "reward": 0.4461521580815315, | |
| "reward_std": 0.6339214891195297, | |
| "rewards/cosine_scaled_reward": 0.06488732434809208, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 2519.2500610351562, | |
| "epoch": 0.44457142857142856, | |
| "grad_norm": 0.3315029740333557, | |
| "kl": 0.347686767578125, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.0373, | |
| "reward": 0.41832172160502523, | |
| "reward_std": 0.8497539162635803, | |
| "rewards/cosine_scaled_reward": 0.04175483621656895, | |
| "rewards/format_reward": 0.7291666846722364, | |
| "step": 389 | |
| }, | |
| { | |
| "completion_length": 2520.270881652832, | |
| "epoch": 0.44571428571428573, | |
| "grad_norm": 0.6555607914924622, | |
| "kl": 0.3785400390625, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.0494, | |
| "reward": 0.36745146568864584, | |
| "reward_std": 0.7271608784794807, | |
| "rewards/cosine_scaled_reward": 0.005871989764273167, | |
| "rewards/format_reward": 0.7500000223517418, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 2606.7916870117188, | |
| "epoch": 0.44685714285714284, | |
| "grad_norm": 0.5517706871032715, | |
| "kl": 0.4510498046875, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.0531, | |
| "reward": 0.30924548767507076, | |
| "reward_std": 0.7186228446662426, | |
| "rewards/cosine_scaled_reward": 0.02813367173075676, | |
| "rewards/format_reward": 0.6250000037252903, | |
| "step": 391 | |
| }, | |
| { | |
| "completion_length": 2368.2500762939453, | |
| "epoch": 0.448, | |
| "grad_norm": 1.1368578672409058, | |
| "kl": 0.367889404296875, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.0712, | |
| "reward": 0.43708939105272293, | |
| "reward_std": 0.9549101106822491, | |
| "rewards/cosine_scaled_reward": -0.014195648953318596, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 2708.354217529297, | |
| "epoch": 0.4491428571428571, | |
| "grad_norm": 1.338770866394043, | |
| "kl": 0.411865234375, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.0669, | |
| "reward": 0.5190371284261346, | |
| "reward_std": 1.0145683512091637, | |
| "rewards/cosine_scaled_reward": 0.11480109271360561, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 393 | |
| }, | |
| { | |
| "completion_length": 3059.1250610351562, | |
| "epoch": 0.4502857142857143, | |
| "grad_norm": 0.698340117931366, | |
| "kl": 0.5703125, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.0558, | |
| "reward": 0.08016422716900706, | |
| "reward_std": 0.9071974717080593, | |
| "rewards/cosine_scaled_reward": -0.10081264981999993, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 2238.604232788086, | |
| "epoch": 0.4514285714285714, | |
| "grad_norm": 0.527318000793457, | |
| "kl": 0.3275146484375, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.0224, | |
| "reward": 0.518818385200575, | |
| "reward_std": 0.812805999070406, | |
| "rewards/cosine_scaled_reward": 0.07729056139942259, | |
| "rewards/format_reward": 0.7916666902601719, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 2808.7084045410156, | |
| "epoch": 0.45257142857142857, | |
| "grad_norm": 1.1238527297973633, | |
| "kl": 0.48876953125, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.1152, | |
| "reward": 0.2978093853453174, | |
| "reward_std": 0.9074007868766785, | |
| "rewards/cosine_scaled_reward": -0.03470429126173258, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 2351.8126068115234, | |
| "epoch": 0.45371428571428574, | |
| "grad_norm": 0.3706618845462799, | |
| "kl": 0.246002197265625, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.0129, | |
| "reward": 0.37504480965435505, | |
| "reward_std": 0.5897877439856529, | |
| "rewards/cosine_scaled_reward": -0.05014793388545513, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 397 | |
| }, | |
| { | |
| "completion_length": 2328.37508392334, | |
| "epoch": 0.45485714285714285, | |
| "grad_norm": 0.7165157198905945, | |
| "kl": 0.3564453125, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.0288, | |
| "reward": 0.3573523070663214, | |
| "reward_std": 0.7765154354274273, | |
| "rewards/cosine_scaled_reward": 0.0030241278000175953, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 2353.854202270508, | |
| "epoch": 0.456, | |
| "grad_norm": 1.2241944074630737, | |
| "kl": 0.3509521484375, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.0858, | |
| "reward": 0.7341470178216696, | |
| "reward_std": 1.0539017170667648, | |
| "rewards/cosine_scaled_reward": 0.18186382483690977, | |
| "rewards/format_reward": 0.8541666939854622, | |
| "step": 399 | |
| }, | |
| { | |
| "completion_length": 1577.7917098999023, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 1.081485390663147, | |
| "kl": 0.142333984375, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": -0.0309, | |
| "reward": 0.879038143903017, | |
| "reward_std": 0.9596639424562454, | |
| "rewards/cosine_scaled_reward": 0.2600486520677805, | |
| "rewards/format_reward": 0.8958333395421505, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 2888.479217529297, | |
| "epoch": 0.4582857142857143, | |
| "grad_norm": 0.948920726776123, | |
| "kl": 0.559173583984375, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.0508, | |
| "reward": 0.2996121197938919, | |
| "reward_std": 0.6879791766405106, | |
| "rewards/cosine_scaled_reward": -0.01863069087266922, | |
| "rewards/format_reward": 0.7083333488553762, | |
| "step": 401 | |
| }, | |
| { | |
| "completion_length": 2353.104217529297, | |
| "epoch": 0.4594285714285714, | |
| "grad_norm": 0.6571682095527649, | |
| "kl": 0.3966827392578125, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.0259, | |
| "reward": 0.2608861066401005, | |
| "reward_std": 0.6378191784024239, | |
| "rewards/cosine_scaled_reward": -0.031162479892373085, | |
| "rewards/format_reward": 0.6875000167638063, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 2268.104232788086, | |
| "epoch": 0.4605714285714286, | |
| "grad_norm": 0.6540146470069885, | |
| "kl": 0.3348388671875, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.0243, | |
| "reward": 0.3493347900584922, | |
| "reward_std": 0.7477662637829781, | |
| "rewards/cosine_scaled_reward": -0.0498207900673151, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 403 | |
| }, | |
| { | |
| "completion_length": 2252.0625610351562, | |
| "epoch": 0.4617142857142857, | |
| "grad_norm": 0.5845898985862732, | |
| "kl": 0.31449127197265625, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.0223, | |
| "reward": 0.3203935632482171, | |
| "reward_std": 0.5589652694761753, | |
| "rewards/cosine_scaled_reward": -0.0724391471594572, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 2176.437557220459, | |
| "epoch": 0.46285714285714286, | |
| "grad_norm": 0.5805149078369141, | |
| "kl": 0.4361114501953125, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0716, | |
| "reward": 0.5914140390232205, | |
| "reward_std": 0.9176547713577747, | |
| "rewards/cosine_scaled_reward": 0.1673335493542254, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 2222.3125762939453, | |
| "epoch": 0.464, | |
| "grad_norm": 0.714565098285675, | |
| "kl": 0.348388671875, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.053, | |
| "reward": 0.5401954464614391, | |
| "reward_std": 0.9346184208989143, | |
| "rewards/cosine_scaled_reward": 0.03536188416182995, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 2257.3542404174805, | |
| "epoch": 0.46514285714285714, | |
| "grad_norm": 0.7260815501213074, | |
| "kl": 0.316375732421875, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.0606, | |
| "reward": 0.6055942573584616, | |
| "reward_std": 0.6878091357648373, | |
| "rewards/cosine_scaled_reward": 0.12408588983817026, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 407 | |
| }, | |
| { | |
| "completion_length": 2682.041717529297, | |
| "epoch": 0.4662857142857143, | |
| "grad_norm": 0.7148492336273193, | |
| "kl": 0.502105712890625, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.0441, | |
| "reward": 0.5293791117146611, | |
| "reward_std": 0.863475501537323, | |
| "rewards/cosine_scaled_reward": 0.14691544696688652, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 3076.0834045410156, | |
| "epoch": 0.4674285714285714, | |
| "grad_norm": 0.8249404430389404, | |
| "kl": 0.6727294921875, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.0616, | |
| "reward": 0.002814117819070816, | |
| "reward_std": 0.8114083893597126, | |
| "rewards/cosine_scaled_reward": -0.10041821748018265, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 409 | |
| }, | |
| { | |
| "completion_length": 2465.000087738037, | |
| "epoch": 0.4685714285714286, | |
| "grad_norm": 0.6067022681236267, | |
| "kl": 0.5772857666015625, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0528, | |
| "reward": 0.12211680319160223, | |
| "reward_std": 0.7772498056292534, | |
| "rewards/cosine_scaled_reward": -0.13637491036206484, | |
| "rewards/format_reward": 0.6875000093132257, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 2848.7708740234375, | |
| "epoch": 0.4697142857142857, | |
| "grad_norm": 0.5674929022789001, | |
| "kl": 0.5439453125, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.0797, | |
| "reward": 0.23797017033211887, | |
| "reward_std": 0.8856572322547436, | |
| "rewards/cosine_scaled_reward": -0.07174723839852959, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 411 | |
| }, | |
| { | |
| "completion_length": 2702.9375610351562, | |
| "epoch": 0.47085714285714286, | |
| "grad_norm": 0.9189234375953674, | |
| "kl": 0.4500732421875, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.025, | |
| "reward": 0.47001307643949986, | |
| "reward_std": 0.9889923110604286, | |
| "rewards/cosine_scaled_reward": 0.04870818182826042, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 2750.1250762939453, | |
| "epoch": 0.472, | |
| "grad_norm": 0.7404162287712097, | |
| "kl": 0.49786376953125, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.0398, | |
| "reward": 0.37627727701328695, | |
| "reward_std": 0.7015467956662178, | |
| "rewards/cosine_scaled_reward": -0.021446891129016876, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 413 | |
| }, | |
| { | |
| "completion_length": 3040.6459350585938, | |
| "epoch": 0.47314285714285714, | |
| "grad_norm": 1.531003475189209, | |
| "kl": 0.66845703125, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.0372, | |
| "reward": -0.023312292993068695, | |
| "reward_std": 0.6657886579632759, | |
| "rewards/cosine_scaled_reward": -0.20822268491610885, | |
| "rewards/format_reward": 0.6458333507180214, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 3178.354248046875, | |
| "epoch": 0.4742857142857143, | |
| "grad_norm": 1.0874366760253906, | |
| "kl": 0.808349609375, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0745, | |
| "reward": -0.028731117257848382, | |
| "reward_std": 0.8625498786568642, | |
| "rewards/cosine_scaled_reward": -0.1594004575163126, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 2240.5625610351562, | |
| "epoch": 0.4754285714285714, | |
| "grad_norm": 1.3929944038391113, | |
| "kl": 0.28546142578125, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.0775, | |
| "reward": 0.5215831075329334, | |
| "reward_std": 0.8785289078950882, | |
| "rewards/cosine_scaled_reward": 0.049687013030052185, | |
| "rewards/format_reward": 0.8541666939854622, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 3151.9375915527344, | |
| "epoch": 0.4765714285714286, | |
| "grad_norm": 0.6912859678268433, | |
| "kl": 0.676025390625, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.0716, | |
| "reward": -0.13146568089723587, | |
| "reward_std": 0.7075865548104048, | |
| "rewards/cosine_scaled_reward": -0.1975945346057415, | |
| "rewards/format_reward": 0.47916668467223644, | |
| "step": 417 | |
| }, | |
| { | |
| "completion_length": 2078.3542289733887, | |
| "epoch": 0.4777142857142857, | |
| "grad_norm": 1.4748533964157104, | |
| "kl": 0.241912841796875, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": -0.0234, | |
| "reward": 0.6464557002764195, | |
| "reward_std": 0.8331911526620388, | |
| "rewards/cosine_scaled_reward": 0.1137741282582283, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 2601.4584197998047, | |
| "epoch": 0.47885714285714287, | |
| "grad_norm": 0.4618014395236969, | |
| "kl": 0.39984130859375, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.0497, | |
| "reward": 0.6187925288686529, | |
| "reward_std": 0.9556396976113319, | |
| "rewards/cosine_scaled_reward": 0.1003096466884017, | |
| "rewards/format_reward": 0.8750000223517418, | |
| "step": 419 | |
| }, | |
| { | |
| "completion_length": 2392.33341217041, | |
| "epoch": 0.48, | |
| "grad_norm": 0.7824205160140991, | |
| "kl": 0.47357177734375, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.0699, | |
| "reward": 0.2953487314807717, | |
| "reward_std": 0.6610889099538326, | |
| "rewards/cosine_scaled_reward": -0.03334581479430199, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 3098.0834045410156, | |
| "epoch": 0.48114285714285715, | |
| "grad_norm": 1.4201315641403198, | |
| "kl": 0.530029296875, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.0068, | |
| "reward": -0.07355257519520819, | |
| "reward_std": 0.6699985489249229, | |
| "rewards/cosine_scaled_reward": -0.20630090683698654, | |
| "rewards/format_reward": 0.583333345130086, | |
| "step": 421 | |
| }, | |
| { | |
| "completion_length": 2704.5833892822266, | |
| "epoch": 0.48228571428571426, | |
| "grad_norm": 0.31716275215148926, | |
| "kl": 0.3623046875, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.0337, | |
| "reward": 0.30903656780719757, | |
| "reward_std": 0.791595920920372, | |
| "rewards/cosine_scaled_reward": -0.028064551996067166, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 2378.3334197998047, | |
| "epoch": 0.48342857142857143, | |
| "grad_norm": 0.561937153339386, | |
| "kl": 0.29791259765625, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": -0.0015, | |
| "reward": 0.24677492817863822, | |
| "reward_std": 0.749111071228981, | |
| "rewards/cosine_scaled_reward": -0.10133864358067513, | |
| "rewards/format_reward": 0.7916666753590107, | |
| "step": 423 | |
| }, | |
| { | |
| "completion_length": 2921.854202270508, | |
| "epoch": 0.4845714285714286, | |
| "grad_norm": 0.8485268950462341, | |
| "kl": 0.461669921875, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.0307, | |
| "reward": 0.025166813982650638, | |
| "reward_std": 0.8601982519030571, | |
| "rewards/cosine_scaled_reward": -0.17313844989985228, | |
| "rewards/format_reward": 0.6250000204890966, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 2087.3125762939453, | |
| "epoch": 0.4857142857142857, | |
| "grad_norm": 1.4564896821975708, | |
| "kl": 0.282867431640625, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.0946, | |
| "reward": 0.7901451410725713, | |
| "reward_std": 1.1320747658610344, | |
| "rewards/cosine_scaled_reward": 0.2687780649284832, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 2330.562599182129, | |
| "epoch": 0.4868571428571429, | |
| "grad_norm": 0.6082496047019958, | |
| "kl": 0.316375732421875, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.0174, | |
| "reward": 0.3098383769392967, | |
| "reward_std": 0.7229567170143127, | |
| "rewards/cosine_scaled_reward": -0.0652941414155066, | |
| "rewards/format_reward": 0.8125000186264515, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 2724.479217529297, | |
| "epoch": 0.488, | |
| "grad_norm": 0.6990248560905457, | |
| "kl": 0.45989990234375, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.0519, | |
| "reward": 0.12407399946823716, | |
| "reward_std": 0.8184213675558567, | |
| "rewards/cosine_scaled_reward": -0.0571793733688537, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 427 | |
| }, | |
| { | |
| "completion_length": 2934.5834045410156, | |
| "epoch": 0.48914285714285716, | |
| "grad_norm": 0.8835780620574951, | |
| "kl": 0.461181640625, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.0673, | |
| "reward": 0.10110859386622906, | |
| "reward_std": 0.9170898050069809, | |
| "rewards/cosine_scaled_reward": -0.1494620693847537, | |
| "rewards/format_reward": 0.6666666902601719, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 2098.0208892822266, | |
| "epoch": 0.49028571428571427, | |
| "grad_norm": 0.4786301255226135, | |
| "kl": 0.250732421875, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.0115, | |
| "reward": 0.21683326549828053, | |
| "reward_std": 0.8974504359066486, | |
| "rewards/cosine_scaled_reward": -0.14522514073178172, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 429 | |
| }, | |
| { | |
| "completion_length": 2277.187530517578, | |
| "epoch": 0.49142857142857144, | |
| "grad_norm": 0.34657105803489685, | |
| "kl": 0.2308349609375, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0271, | |
| "reward": 0.20404995302669704, | |
| "reward_std": 0.6838134452700615, | |
| "rewards/cosine_scaled_reward": -0.08210572600364685, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 2592.3125915527344, | |
| "epoch": 0.49257142857142855, | |
| "grad_norm": 0.40773648023605347, | |
| "kl": 0.3669281005859375, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.0348, | |
| "reward": -0.10748124029487371, | |
| "reward_std": 0.5141724869608879, | |
| "rewards/cosine_scaled_reward": -0.23057417757809162, | |
| "rewards/format_reward": 0.6041666697710752, | |
| "step": 431 | |
| }, | |
| { | |
| "completion_length": 2727.416748046875, | |
| "epoch": 0.4937142857142857, | |
| "grad_norm": 0.3802461326122284, | |
| "kl": 0.365966796875, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.0608, | |
| "reward": 0.1675979122519493, | |
| "reward_std": 0.601953960955143, | |
| "rewards/cosine_scaled_reward": -0.07371543161571026, | |
| "rewards/format_reward": 0.6458333376795053, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 2820.9791870117188, | |
| "epoch": 0.4948571428571429, | |
| "grad_norm": 0.9824896454811096, | |
| "kl": 0.38507080078125, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.0532, | |
| "reward": 0.24698512954637408, | |
| "reward_std": 0.9584920853376389, | |
| "rewards/cosine_scaled_reward": -0.006244649179279804, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 433 | |
| }, | |
| { | |
| "completion_length": 2987.625045776367, | |
| "epoch": 0.496, | |
| "grad_norm": 1.5248559713363647, | |
| "kl": 0.41156005859375, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.0052, | |
| "reward": -0.08143354021012783, | |
| "reward_std": 0.48334776237607, | |
| "rewards/cosine_scaled_reward": -0.25419083051383495, | |
| "rewards/format_reward": 0.6875000037252903, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 2147.1250228881836, | |
| "epoch": 0.49714285714285716, | |
| "grad_norm": 0.3896443545818329, | |
| "kl": 0.2529144287109375, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.0041, | |
| "reward": 0.21179522573947906, | |
| "reward_std": 0.6268462352454662, | |
| "rewards/cosine_scaled_reward": -0.12806864827871323, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 2244.6042251586914, | |
| "epoch": 0.4982857142857143, | |
| "grad_norm": 1.1815028190612793, | |
| "kl": 0.34857177734375, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": -0.0171, | |
| "reward": 0.31779580656439066, | |
| "reward_std": 0.8181377947330475, | |
| "rewards/cosine_scaled_reward": -0.02664483431726694, | |
| "rewards/format_reward": 0.7291666772216558, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 2895.5834045410156, | |
| "epoch": 0.49942857142857144, | |
| "grad_norm": 0.6824524402618408, | |
| "kl": 0.3638916015625, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.053, | |
| "reward": 0.1753413761034608, | |
| "reward_std": 0.7372228689491749, | |
| "rewards/cosine_scaled_reward": -0.11190539970993996, | |
| "rewards/format_reward": 0.7291666828095913, | |
| "step": 437 | |
| }, | |
| { | |
| "completion_length": 2927.729232788086, | |
| "epoch": 0.5005714285714286, | |
| "grad_norm": 0.5597608685493469, | |
| "kl": 0.362335205078125, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.0319, | |
| "reward": 0.1679457863792777, | |
| "reward_std": 0.7294919807463884, | |
| "rewards/cosine_scaled_reward": -0.09090281999669969, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 2691.5833740234375, | |
| "epoch": 0.5017142857142857, | |
| "grad_norm": 0.8128771781921387, | |
| "kl": 0.296539306640625, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.0088, | |
| "reward": -0.08755548892077059, | |
| "reward_std": 0.49066366255283356, | |
| "rewards/cosine_scaled_reward": -0.1950508914887905, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 439 | |
| }, | |
| { | |
| "completion_length": 3066.7292404174805, | |
| "epoch": 0.5028571428571429, | |
| "grad_norm": 1.3051230907440186, | |
| "kl": 0.438079833984375, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0168, | |
| "reward": -0.09066830575466156, | |
| "reward_std": 0.5652249306440353, | |
| "rewards/cosine_scaled_reward": -0.1592588908970356, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 2851.2500610351562, | |
| "epoch": 0.504, | |
| "grad_norm": 0.5024367570877075, | |
| "kl": 0.3612060546875, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.0408, | |
| "reward": 0.41364969685673714, | |
| "reward_std": 0.8325144313275814, | |
| "rewards/cosine_scaled_reward": 0.01983257569372654, | |
| "rewards/format_reward": 0.7708333469927311, | |
| "step": 441 | |
| }, | |
| { | |
| "completion_length": 2756.0833892822266, | |
| "epoch": 0.5051428571428571, | |
| "grad_norm": 0.634526789188385, | |
| "kl": 0.311614990234375, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.0151, | |
| "reward": 0.31202031567227095, | |
| "reward_std": 0.8749045357108116, | |
| "rewards/cosine_scaled_reward": -0.0003691236488521099, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 3059.8125534057617, | |
| "epoch": 0.5062857142857143, | |
| "grad_norm": 0.5289911031723022, | |
| "kl": 0.453643798828125, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.0414, | |
| "reward": 0.029406734742224216, | |
| "reward_std": 0.8291610702872276, | |
| "rewards/cosine_scaled_reward": -0.1101669161580503, | |
| "rewards/format_reward": 0.500000013038516, | |
| "step": 443 | |
| }, | |
| { | |
| "completion_length": 2669.6876068115234, | |
| "epoch": 0.5074285714285715, | |
| "grad_norm": 1.0344839096069336, | |
| "kl": 0.39227294921875, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.0173, | |
| "reward": 0.18073289468884468, | |
| "reward_std": 0.8002647534012794, | |
| "rewards/cosine_scaled_reward": -0.05391998961567879, | |
| "rewards/format_reward": 0.6041666809469461, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 2744.3959045410156, | |
| "epoch": 0.5085714285714286, | |
| "grad_norm": 1.1817784309387207, | |
| "kl": 0.28948974609375, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.0885, | |
| "reward": 0.130654014647007, | |
| "reward_std": 0.9245680868625641, | |
| "rewards/cosine_scaled_reward": -0.07616318017244339, | |
| "rewards/format_reward": 0.562500013038516, | |
| "step": 445 | |
| }, | |
| { | |
| "completion_length": 2821.354217529297, | |
| "epoch": 0.5097142857142857, | |
| "grad_norm": 0.5576800107955933, | |
| "kl": 0.2890625, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.0212, | |
| "reward": 0.15429426170885563, | |
| "reward_std": 0.7137273997068405, | |
| "rewards/cosine_scaled_reward": -0.09584356285631657, | |
| "rewards/format_reward": 0.6666666679084301, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 2405.583396911621, | |
| "epoch": 0.5108571428571429, | |
| "grad_norm": 0.211807519197464, | |
| "kl": 0.246002197265625, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.0362, | |
| "reward": -0.02742259856313467, | |
| "reward_std": 0.662248931825161, | |
| "rewards/cosine_scaled_reward": -0.2382231242954731, | |
| "rewards/format_reward": 0.708333345130086, | |
| "step": 447 | |
| }, | |
| { | |
| "completion_length": 2420.5834045410156, | |
| "epoch": 0.512, | |
| "grad_norm": 0.43571776151657104, | |
| "kl": 0.217529296875, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.0407, | |
| "reward": 0.2491408372297883, | |
| "reward_std": 0.7298646531999111, | |
| "rewards/cosine_scaled_reward": -0.05479799164459109, | |
| "rewards/format_reward": 0.7083333563059568, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 2219.5000610351562, | |
| "epoch": 0.5131428571428571, | |
| "grad_norm": 0.36097243428230286, | |
| "kl": 0.170135498046875, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.0368, | |
| "reward": 0.2152203669102164, | |
| "reward_std": 0.7272625118494034, | |
| "rewards/cosine_scaled_reward": -0.09725791588425636, | |
| "rewards/format_reward": 0.750000013038516, | |
| "step": 449 | |
| }, | |
| { | |
| "completion_length": 2734.604217529297, | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 0.21459351480007172, | |
| "kl": 0.228668212890625, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0388, | |
| "reward": 0.31540712295100093, | |
| "reward_std": 0.6263695955276489, | |
| "rewards/cosine_scaled_reward": -0.024456078186631203, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 2182.0000610351562, | |
| "epoch": 0.5154285714285715, | |
| "grad_norm": 1.156898021697998, | |
| "kl": 0.1932373046875, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.0423, | |
| "reward": 0.4187639909796417, | |
| "reward_std": 0.828635673969984, | |
| "rewards/cosine_scaled_reward": -0.02044131373986602, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 451 | |
| }, | |
| { | |
| "completion_length": 2950.0833892822266, | |
| "epoch": 0.5165714285714286, | |
| "grad_norm": 1.0129191875457764, | |
| "kl": 0.2197265625, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.0467, | |
| "reward": 0.22497872763779014, | |
| "reward_std": 0.7840702906250954, | |
| "rewards/cosine_scaled_reward": -9.69860702753067e-05, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 452 | |
| }, | |
| { | |
| "completion_length": 2666.812545776367, | |
| "epoch": 0.5177142857142857, | |
| "grad_norm": 0.9255622029304504, | |
| "kl": 0.2947998046875, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.0568, | |
| "reward": 0.23635170864872634, | |
| "reward_std": 0.9474616125226021, | |
| "rewards/cosine_scaled_reward": 0.014322125818580389, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 453 | |
| }, | |
| { | |
| "completion_length": 2574.166732788086, | |
| "epoch": 0.5188571428571429, | |
| "grad_norm": 0.32647454738616943, | |
| "kl": 0.2278289794921875, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.0413, | |
| "reward": -0.001333402469754219, | |
| "reward_std": 0.6261133253574371, | |
| "rewards/cosine_scaled_reward": -0.1682842280715704, | |
| "rewards/format_reward": 0.6041666753590107, | |
| "step": 454 | |
| }, | |
| { | |
| "completion_length": 3117.4375915527344, | |
| "epoch": 0.52, | |
| "grad_norm": 0.5304046869277954, | |
| "kl": 0.443359375, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.0768, | |
| "reward": -0.15152850991580635, | |
| "reward_std": 0.6764501072466373, | |
| "rewards/cosine_scaled_reward": -0.2339008767157793, | |
| "rewards/format_reward": 0.5208333469927311, | |
| "step": 455 | |
| }, | |
| { | |
| "completion_length": 2992.0000610351562, | |
| "epoch": 0.5211428571428571, | |
| "grad_norm": 0.8478217720985413, | |
| "kl": 0.326171875, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.0617, | |
| "reward": 0.046928441151976585, | |
| "reward_std": 0.8120874315500259, | |
| "rewards/cosine_scaled_reward": -0.0800070259720087, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 456 | |
| }, | |
| { | |
| "completion_length": 2832.1875610351562, | |
| "epoch": 0.5222857142857142, | |
| "grad_norm": 0.8208078145980835, | |
| "kl": 0.300048828125, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.0135, | |
| "reward": 0.3024125434458256, | |
| "reward_std": 0.7796786315739155, | |
| "rewards/cosine_scaled_reward": 0.018373452126979828, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 457 | |
| }, | |
| { | |
| "completion_length": 2559.0834045410156, | |
| "epoch": 0.5234285714285715, | |
| "grad_norm": 1.155639886856079, | |
| "kl": 0.3125, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.0601, | |
| "reward": -0.02342590008629486, | |
| "reward_std": 0.6464426964521408, | |
| "rewards/cosine_scaled_reward": -0.1930369706824422, | |
| "rewards/format_reward": 0.6250000167638063, | |
| "step": 458 | |
| }, | |
| { | |
| "completion_length": 2455.3125228881836, | |
| "epoch": 0.5245714285714286, | |
| "grad_norm": 0.5656007528305054, | |
| "kl": 0.265380859375, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.0203, | |
| "reward": 0.17724867910146713, | |
| "reward_std": 0.7673604302108288, | |
| "rewards/cosine_scaled_reward": -0.0773997250944376, | |
| "rewards/format_reward": 0.6458333358168602, | |
| "step": 459 | |
| }, | |
| { | |
| "completion_length": 2945.979248046875, | |
| "epoch": 0.5257142857142857, | |
| "grad_norm": 0.9485638737678528, | |
| "kl": 0.40087890625, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.0091, | |
| "reward": -0.21174129098653793, | |
| "reward_std": 0.6470157653093338, | |
| "rewards/cosine_scaled_reward": -0.2377923596650362, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 2731.500045776367, | |
| "epoch": 0.5268571428571428, | |
| "grad_norm": 1.1951149702072144, | |
| "kl": 0.3209228515625, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.0507, | |
| "reward": 0.395471319090575, | |
| "reward_std": 1.0323380753397942, | |
| "rewards/cosine_scaled_reward": 0.06153442489448935, | |
| "rewards/format_reward": 0.6250000204890966, | |
| "step": 461 | |
| }, | |
| { | |
| "completion_length": 2310.6875610351562, | |
| "epoch": 0.528, | |
| "grad_norm": 0.764068603515625, | |
| "kl": 0.26934814453125, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.0606, | |
| "reward": 0.05179802142083645, | |
| "reward_std": 0.5869512222707272, | |
| "rewards/cosine_scaled_reward": -0.16689125541597605, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 462 | |
| }, | |
| { | |
| "completion_length": 2988.8959197998047, | |
| "epoch": 0.5291428571428571, | |
| "grad_norm": 0.6601057052612305, | |
| "kl": 0.36944580078125, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.0391, | |
| "reward": 0.3766809217631817, | |
| "reward_std": 1.0442971400916576, | |
| "rewards/cosine_scaled_reward": 0.0020326152443885803, | |
| "rewards/format_reward": 0.7291666902601719, | |
| "step": 463 | |
| }, | |
| { | |
| "completion_length": 2086.9584045410156, | |
| "epoch": 0.5302857142857142, | |
| "grad_norm": 0.5062448382377625, | |
| "kl": 0.2507781982421875, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.0197, | |
| "reward": 0.4759128368459642, | |
| "reward_std": 0.610221728682518, | |
| "rewards/cosine_scaled_reward": 0.0023420676589012146, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 464 | |
| }, | |
| { | |
| "completion_length": 2885.041732788086, | |
| "epoch": 0.5314285714285715, | |
| "grad_norm": 0.7739923596382141, | |
| "kl": 0.41680908203125, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0037, | |
| "reward": 0.2254231304395944, | |
| "reward_std": 0.9453909136354923, | |
| "rewards/cosine_scaled_reward": -0.08590294234454632, | |
| "rewards/format_reward": 0.7083333563059568, | |
| "step": 465 | |
| }, | |
| { | |
| "completion_length": 2756.520896911621, | |
| "epoch": 0.5325714285714286, | |
| "grad_norm": 1.5458998680114746, | |
| "kl": 0.32745361328125, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.1051, | |
| "reward": 0.3231991082429886, | |
| "reward_std": 0.8955418467521667, | |
| "rewards/cosine_scaled_reward": 0.024495400488376617, | |
| "rewards/format_reward": 0.625000013038516, | |
| "step": 466 | |
| }, | |
| { | |
| "completion_length": 2878.729248046875, | |
| "epoch": 0.5337142857142857, | |
| "grad_norm": 0.46346038579940796, | |
| "kl": 0.31219482421875, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.0296, | |
| "reward": 0.08808497712016106, | |
| "reward_std": 0.6584981977939606, | |
| "rewards/cosine_scaled_reward": -0.19319903245195746, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 467 | |
| }, | |
| { | |
| "completion_length": 2945.479248046875, | |
| "epoch": 0.5348571428571428, | |
| "grad_norm": 0.5377303957939148, | |
| "kl": 0.395263671875, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.0565, | |
| "reward": 0.3272525854408741, | |
| "reward_std": 0.7806177064776421, | |
| "rewards/cosine_scaled_reward": 0.046021029353141785, | |
| "rewards/format_reward": 0.6041666772216558, | |
| "step": 468 | |
| }, | |
| { | |
| "completion_length": 2482.9167251586914, | |
| "epoch": 0.536, | |
| "grad_norm": 0.6872317790985107, | |
| "kl": 0.414398193359375, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.05, | |
| "reward": 0.3257849495857954, | |
| "reward_std": 0.8201456405222416, | |
| "rewards/cosine_scaled_reward": -0.05236888420768082, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 469 | |
| }, | |
| { | |
| "completion_length": 3126.6458892822266, | |
| "epoch": 0.5371428571428571, | |
| "grad_norm": 1.1381498575210571, | |
| "kl": 0.4842529296875, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.0271, | |
| "reward": 0.055752304033376276, | |
| "reward_std": 0.8491252809762955, | |
| "rewards/cosine_scaled_reward": -0.12250774865970016, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 2754.291778564453, | |
| "epoch": 0.5382857142857143, | |
| "grad_norm": 0.48720309138298035, | |
| "kl": 0.3826904296875, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.0524, | |
| "reward": 0.5262258416041732, | |
| "reward_std": 0.9357166737318039, | |
| "rewards/cosine_scaled_reward": 0.09029024560004473, | |
| "rewards/format_reward": 0.7708333544433117, | |
| "step": 471 | |
| }, | |
| { | |
| "completion_length": 2644.812545776367, | |
| "epoch": 0.5394285714285715, | |
| "grad_norm": 0.3475642800331116, | |
| "kl": 0.34210205078125, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.0365, | |
| "reward": 0.23837368440581486, | |
| "reward_std": 0.8128412887454033, | |
| "rewards/cosine_scaled_reward": -0.07736600749194622, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 472 | |
| }, | |
| { | |
| "completion_length": 2967.229232788086, | |
| "epoch": 0.5405714285714286, | |
| "grad_norm": 0.6972779631614685, | |
| "kl": 0.3824462890625, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.0307, | |
| "reward": 0.10205891542136669, | |
| "reward_std": 0.7758991979062557, | |
| "rewards/cosine_scaled_reward": -0.09507068432867527, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 473 | |
| }, | |
| { | |
| "completion_length": 2673.6875610351562, | |
| "epoch": 0.5417142857142857, | |
| "grad_norm": 0.45455074310302734, | |
| "kl": 0.395111083984375, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.039, | |
| "reward": 0.46767894667573273, | |
| "reward_std": 0.8645204231142998, | |
| "rewards/cosine_scaled_reward": 0.09637415563338436, | |
| "rewards/format_reward": 0.687500013038516, | |
| "step": 474 | |
| }, | |
| { | |
| "completion_length": 2484.291717529297, | |
| "epoch": 0.5428571428571428, | |
| "grad_norm": 0.712739109992981, | |
| "kl": 0.348419189453125, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.0527, | |
| "reward": 0.3686076030135155, | |
| "reward_std": 0.916143324226141, | |
| "rewards/cosine_scaled_reward": 0.012207330204546452, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 475 | |
| }, | |
| { | |
| "completion_length": 3011.8959045410156, | |
| "epoch": 0.544, | |
| "grad_norm": 1.0881688594818115, | |
| "kl": 0.4395751953125, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.0524, | |
| "reward": 0.3218140173703432, | |
| "reward_std": 1.0662804022431374, | |
| "rewards/cosine_scaled_reward": -0.025444235419854522, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 476 | |
| }, | |
| { | |
| "completion_length": 2656.3751068115234, | |
| "epoch": 0.5451428571428572, | |
| "grad_norm": 2.0791175365448, | |
| "kl": 0.4376220703125, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.0965, | |
| "reward": 0.2924130540341139, | |
| "reward_std": 0.976725772023201, | |
| "rewards/cosine_scaled_reward": 0.013521750457584858, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 477 | |
| }, | |
| { | |
| "completion_length": 2819.812545776367, | |
| "epoch": 0.5462857142857143, | |
| "grad_norm": 0.5398616194725037, | |
| "kl": 0.363677978515625, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.0443, | |
| "reward": -0.004572154954075813, | |
| "reward_std": 0.6425527259707451, | |
| "rewards/cosine_scaled_reward": -0.13902169838547707, | |
| "rewards/format_reward": 0.5416666753590107, | |
| "step": 478 | |
| }, | |
| { | |
| "completion_length": 2858.562545776367, | |
| "epoch": 0.5474285714285714, | |
| "grad_norm": 0.6435410976409912, | |
| "kl": 0.378143310546875, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.0261, | |
| "reward": 0.18186998274177313, | |
| "reward_std": 0.8897733464837074, | |
| "rewards/cosine_scaled_reward": -0.12430705223232508, | |
| "rewards/format_reward": 0.7291666846722364, | |
| "step": 479 | |
| }, | |
| { | |
| "completion_length": 2607.62508392334, | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 0.8115129470825195, | |
| "kl": 0.40704345703125, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.043, | |
| "reward": 0.002752909902483225, | |
| "reward_std": 0.6639882102608681, | |
| "rewards/cosine_scaled_reward": -0.1971071765292436, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 3112.166748046875, | |
| "epoch": 0.5497142857142857, | |
| "grad_norm": 0.5421264171600342, | |
| "kl": 0.4429931640625, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.0528, | |
| "reward": 0.04194536246359348, | |
| "reward_std": 0.6818186715245247, | |
| "rewards/cosine_scaled_reward": -0.13939355686306953, | |
| "rewards/format_reward": 0.6041666902601719, | |
| "step": 481 | |
| }, | |
| { | |
| "completion_length": 2706.7084045410156, | |
| "epoch": 0.5508571428571428, | |
| "grad_norm": 0.4543374478816986, | |
| "kl": 0.368194580078125, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.0506, | |
| "reward": 0.4122487809509039, | |
| "reward_std": 0.7574451714754105, | |
| "rewards/cosine_scaled_reward": 0.056680090725421906, | |
| "rewards/format_reward": 0.7083333395421505, | |
| "step": 482 | |
| }, | |
| { | |
| "completion_length": 2878.229217529297, | |
| "epoch": 0.552, | |
| "grad_norm": 0.6022698879241943, | |
| "kl": 0.40606689453125, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.0356, | |
| "reward": -0.09259207546710968, | |
| "reward_std": 0.6557772308588028, | |
| "rewards/cosine_scaled_reward": -0.19897521962411702, | |
| "rewards/format_reward": 0.5416666772216558, | |
| "step": 483 | |
| }, | |
| { | |
| "completion_length": 2701.9375610351562, | |
| "epoch": 0.5531428571428572, | |
| "grad_norm": 1.3102695941925049, | |
| "kl": 0.356781005859375, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.0726, | |
| "reward": 0.3319150470197201, | |
| "reward_std": 1.0350141674280167, | |
| "rewards/cosine_scaled_reward": -0.02834635879844427, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 484 | |
| }, | |
| { | |
| "completion_length": 2762.916717529297, | |
| "epoch": 0.5542857142857143, | |
| "grad_norm": 0.8539823293685913, | |
| "kl": 0.47900390625, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.08, | |
| "reward": 0.18864674912765622, | |
| "reward_std": 0.7356266789138317, | |
| "rewards/cosine_scaled_reward": -0.11939917271956801, | |
| "rewards/format_reward": 0.7500000204890966, | |
| "step": 485 | |
| }, | |
| { | |
| "completion_length": 2239.6667404174805, | |
| "epoch": 0.5554285714285714, | |
| "grad_norm": 0.3320426940917969, | |
| "kl": 0.263824462890625, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.0167, | |
| "reward": 0.2528488418611232, | |
| "reward_std": 0.7695866264402866, | |
| "rewards/cosine_scaled_reward": -0.10691238380968571, | |
| "rewards/format_reward": 0.812500013038516, | |
| "step": 486 | |
| }, | |
| { | |
| "completion_length": 2169.6666984558105, | |
| "epoch": 0.5565714285714286, | |
| "grad_norm": 0.562497615814209, | |
| "kl": 0.274017333984375, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.0039, | |
| "reward": 0.44721357547678053, | |
| "reward_std": 0.8235045485198498, | |
| "rewards/cosine_scaled_reward": 0.010554181411862373, | |
| "rewards/format_reward": 0.8333333395421505, | |
| "step": 487 | |
| }, | |
| { | |
| "completion_length": 2426.8125534057617, | |
| "epoch": 0.5577142857142857, | |
| "grad_norm": 0.2765026092529297, | |
| "kl": 0.3238372802734375, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.0469, | |
| "reward": 0.08554835570976138, | |
| "reward_std": 0.5648987516760826, | |
| "rewards/cosine_scaled_reward": -0.19786089658737183, | |
| "rewards/format_reward": 0.7916666753590107, | |
| "step": 488 | |
| }, | |
| { | |
| "completion_length": 3204.875030517578, | |
| "epoch": 0.5588571428571428, | |
| "grad_norm": 1.2936737537384033, | |
| "kl": 0.4969482421875, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.0295, | |
| "reward": -0.17606779746711254, | |
| "reward_std": 0.6055882386863232, | |
| "rewards/cosine_scaled_reward": -0.21996455593034625, | |
| "rewards/format_reward": 0.47916668094694614, | |
| "step": 489 | |
| }, | |
| { | |
| "completion_length": 2257.1458892822266, | |
| "epoch": 0.56, | |
| "grad_norm": 0.4323062598705292, | |
| "kl": 0.3099365234375, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.0338, | |
| "reward": 0.3065086267888546, | |
| "reward_std": 0.7493411414325237, | |
| "rewards/cosine_scaled_reward": -0.061892845667898655, | |
| "rewards/format_reward": 0.7916666734963655, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 2736.312545776367, | |
| "epoch": 0.5611428571428572, | |
| "grad_norm": 0.5150272846221924, | |
| "kl": 0.3674163818359375, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.0312, | |
| "reward": 0.5249021016061306, | |
| "reward_std": 1.010607898235321, | |
| "rewards/cosine_scaled_reward": 0.1508561042137444, | |
| "rewards/format_reward": 0.6250000204890966, | |
| "step": 491 | |
| }, | |
| { | |
| "completion_length": 2490.8958740234375, | |
| "epoch": 0.5622857142857143, | |
| "grad_norm": 1.4419853687286377, | |
| "kl": 0.42706298828125, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.0032, | |
| "reward": 0.1767475767992437, | |
| "reward_std": 0.777750164270401, | |
| "rewards/cosine_scaled_reward": -0.07436079788021743, | |
| "rewards/format_reward": 0.6458333469927311, | |
| "step": 492 | |
| }, | |
| { | |
| "completion_length": 2550.9375762939453, | |
| "epoch": 0.5634285714285714, | |
| "grad_norm": 0.4972594380378723, | |
| "kl": 0.450469970703125, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.0431, | |
| "reward": 0.6753305066376925, | |
| "reward_std": 0.8794010616838932, | |
| "rewards/cosine_scaled_reward": 0.15572454407811165, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 493 | |
| }, | |
| { | |
| "completion_length": 2323.479232788086, | |
| "epoch": 0.5645714285714286, | |
| "grad_norm": 0.5891183614730835, | |
| "kl": 0.3460693359375, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.0271, | |
| "reward": 0.6445839628577232, | |
| "reward_std": 1.0191630199551582, | |
| "rewards/cosine_scaled_reward": 0.12392374624687363, | |
| "rewards/format_reward": 0.8541666753590107, | |
| "step": 494 | |
| }, | |
| { | |
| "completion_length": 2892.916717529297, | |
| "epoch": 0.5657142857142857, | |
| "grad_norm": 0.4543853998184204, | |
| "kl": 0.45623779296875, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0451, | |
| "reward": 0.18997809663414955, | |
| "reward_std": 0.8969040662050247, | |
| "rewards/cosine_scaled_reward": -0.05571722239255905, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 495 | |
| }, | |
| { | |
| "completion_length": 2114.500045776367, | |
| "epoch": 0.5668571428571428, | |
| "grad_norm": 1.2619980573654175, | |
| "kl": 0.32281494140625, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.0048, | |
| "reward": 0.488508015871048, | |
| "reward_std": 0.7361417338252068, | |
| "rewards/cosine_scaled_reward": 0.10424756724387407, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 496 | |
| }, | |
| { | |
| "completion_length": 2322.916679382324, | |
| "epoch": 0.568, | |
| "grad_norm": 0.9984971284866333, | |
| "kl": 0.2513885498046875, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": -0.0203, | |
| "reward": 0.6254424216458574, | |
| "reward_std": 0.7260546460747719, | |
| "rewards/cosine_scaled_reward": 0.1703429389744997, | |
| "rewards/format_reward": 0.7708333488553762, | |
| "step": 497 | |
| }, | |
| { | |
| "completion_length": 2773.166748046875, | |
| "epoch": 0.5691428571428572, | |
| "grad_norm": 0.5907127857208252, | |
| "kl": 0.4683837890625, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.0642, | |
| "reward": 0.36813890002667904, | |
| "reward_std": 0.9811634942889214, | |
| "rewards/cosine_scaled_reward": -0.0031640082597732544, | |
| "rewards/format_reward": 0.7291666828095913, | |
| "step": 498 | |
| }, | |
| { | |
| "completion_length": 2583.5834350585938, | |
| "epoch": 0.5702857142857143, | |
| "grad_norm": 0.35024699568748474, | |
| "kl": 0.358978271484375, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.0401, | |
| "reward": 0.21864662691950798, | |
| "reward_std": 0.8943019956350327, | |
| "rewards/cosine_scaled_reward": -0.07930864673107862, | |
| "rewards/format_reward": 0.6875000037252903, | |
| "step": 499 | |
| }, | |
| { | |
| "completion_length": 3055.6666870117188, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 1.0981477499008179, | |
| "kl": 0.4345703125, | |
| "learning_rate": 1e-07, | |
| "loss": 0.023, | |
| "reward": 0.14122199080884457, | |
| "reward_std": 0.8392615914344788, | |
| "rewards/cosine_scaled_reward": -0.031122979940846562, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.038267850877775345, | |
| "train_runtime": 72125.5591, | |
| "train_samples_per_second": 0.333, | |
| "train_steps_per_second": 0.007 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |