Text Generation
Transformers
Safetensors
qwen2
Generated from Trainer
open-r1
trl
grpo
conversational
text-generation-inference
Instructions to use LLucass/Ours_Dr with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use LLucass/Ours_Dr with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="LLucass/Ours_Dr")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("LLucass/Ours_Dr")
model = AutoModelForCausalLM.from_pretrained("LLucass/Ours_Dr")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use LLucass/Ours_Dr with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "LLucass/Ours_Dr"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "LLucass/Ours_Dr",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/LLucass/Ours_Dr
- SGLang
How to use LLucass/Ours_Dr with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "LLucass/Ours_Dr" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "LLucass/Ours_Dr",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "LLucass/Ours_Dr" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "LLucass/Ours_Dr",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use LLucass/Ours_Dr with Docker Model Runner:
docker model run hf.co/LLucass/Ours_Dr
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.11428571428571428, | |
| "eval_steps": 500, | |
| "global_step": 100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 2700.4271850585938, | |
| "cov_mean": -2.6832926778297406e-05, | |
| "cov_std": 0.24635104648768902, | |
| "entropy": 0.36865234375, | |
| "epoch": 0.001142857142857143, | |
| "grad_norm": 0.35615867376327515, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.0696, | |
| "reward": 0.7604166893288493, | |
| "reward_std": 0.4268697127699852, | |
| "rewards/accuracy_reward": 0.25000001303851604, | |
| "rewards/format_reward": 0.5104166669771075, | |
| "step": 1, | |
| "w_high_ratio": 0.2208261415362358, | |
| "w_low_ratio": 0.027151118498295546, | |
| "w_max": 2.1915207505226135, | |
| "w_mean": 1.4711343348026276, | |
| "w_min": 1.404075949984986e-37, | |
| "w_std": 0.24041971936821938 | |
| }, | |
| { | |
| "completion_length": 3127.3958435058594, | |
| "cov_mean": -1.8215427189716138e-05, | |
| "cov_std": 0.18336841650307178, | |
| "entropy": 0.353515625, | |
| "epoch": 0.002285714285714286, | |
| "grad_norm": 0.18010225892066956, | |
| "kl": 0.0, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0533, | |
| "reward": 0.6458333637565374, | |
| "reward_std": 0.4249730706214905, | |
| "rewards/accuracy_reward": 0.2812500102445483, | |
| "rewards/format_reward": 0.3645833386108279, | |
| "step": 2, | |
| "w_high_ratio": 0.05701034888625145, | |
| "w_low_ratio": 0.023528859252110124, | |
| "w_max": 1.811183512210846, | |
| "w_mean": 1.2113382518291473, | |
| "w_min": 0.0, | |
| "w_std": 0.15613791532814503 | |
| }, | |
| { | |
| "completion_length": 3691.0626220703125, | |
| "cov_mean": 2.796226033296989e-05, | |
| "cov_std": 0.1637928392738104, | |
| "entropy": 0.44189453125, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.1356951743364334, | |
| "kl": 3.916025161743164e-05, | |
| "learning_rate": 2e-07, | |
| "loss": 0.052, | |
| "reward": 0.19791667256504297, | |
| "reward_std": 0.3607826754450798, | |
| "rewards/accuracy_reward": 0.05208333395421505, | |
| "rewards/format_reward": 0.14583333674818277, | |
| "step": 3, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.02235229848884046, | |
| "w_max": 1.460817277431488, | |
| "w_mean": 1.082369714975357, | |
| "w_min": 3.3280838527714405e-44, | |
| "w_std": 0.12309953197836876 | |
| }, | |
| { | |
| "completion_length": 2353.2709350585938, | |
| "cov_mean": 1.0425418167869793e-05, | |
| "cov_std": 0.3036706894636154, | |
| "entropy": 0.41259765625, | |
| "epoch": 0.004571428571428572, | |
| "grad_norm": 0.190170019865036, | |
| "kl": 3.3348798751831055e-05, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0459, | |
| "reward": 0.8750000149011612, | |
| "reward_std": 0.5107106417417526, | |
| "rewards/accuracy_reward": 0.1875000037252903, | |
| "rewards/format_reward": 0.6875000298023224, | |
| "step": 4, | |
| "w_high_ratio": 0.2652290016412735, | |
| "w_low_ratio": 0.034206886775791645, | |
| "w_max": 2.106997400522232, | |
| "w_mean": 1.5420070886611938, | |
| "w_min": 2.4617042843759845e-36, | |
| "w_std": 0.2812090367078781 | |
| }, | |
| { | |
| "completion_length": 3485.1771850585938, | |
| "cov_mean": 3.2382055223934003e-06, | |
| "cov_std": 0.29665667191147804, | |
| "entropy": 0.4609375, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.2197088897228241, | |
| "kl": 4.2125582695007324e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0803, | |
| "reward": 0.46875001303851604, | |
| "reward_std": 0.5515270829200745, | |
| "rewards/accuracy_reward": 0.1145833358168602, | |
| "rewards/format_reward": 0.35416667722165585, | |
| "step": 5, | |
| "w_high_ratio": 0.008333034813404083, | |
| "w_low_ratio": 0.04545952333137393, | |
| "w_max": 1.5202394425868988, | |
| "w_mean": 1.1503158807754517, | |
| "w_min": 5.693325166185118e-29, | |
| "w_std": 0.23378031328320503 | |
| }, | |
| { | |
| "completion_length": 3451.2500610351562, | |
| "cov_mean": -4.464495305001037e-05, | |
| "cov_std": 0.236886378377676, | |
| "entropy": 0.46142578125, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.13218647241592407, | |
| "kl": 4.482269287109375e-05, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0517, | |
| "reward": 0.3645833507180214, | |
| "reward_std": 0.515114888548851, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/format_reward": 0.28125000186264515, | |
| "step": 6, | |
| "w_high_ratio": 0.0625, | |
| "w_low_ratio": 0.031897591426968575, | |
| "w_max": 1.5891262888908386, | |
| "w_mean": 1.1359511613845825, | |
| "w_min": 0.0, | |
| "w_std": 0.14766533859074116 | |
| }, | |
| { | |
| "completion_length": 3224.3125610351562, | |
| "cov_mean": 4.349886694399174e-06, | |
| "cov_std": 0.3991788253188133, | |
| "entropy": 0.38671875, | |
| "epoch": 0.008, | |
| "grad_norm": 0.22412240505218506, | |
| "kl": 2.1651387214660645e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0742, | |
| "reward": 0.8541666865348816, | |
| "reward_std": 0.6870906725525856, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 7, | |
| "w_high_ratio": 0.047733694314956665, | |
| "w_low_ratio": 0.053672163281589746, | |
| "w_max": 1.5988431572914124, | |
| "w_mean": 1.2651265263557434, | |
| "w_min": 6.1929912552473734e-37, | |
| "w_std": 0.2889493927359581 | |
| }, | |
| { | |
| "completion_length": 2800.9583740234375, | |
| "cov_mean": 1.0622998161124997e-06, | |
| "cov_std": 0.15430260822176933, | |
| "entropy": 0.33740234375, | |
| "epoch": 0.009142857142857144, | |
| "grad_norm": 0.11328813433647156, | |
| "kl": 1.7002224922180176e-05, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0184, | |
| "reward": 0.8958333730697632, | |
| "reward_std": 0.25296592339873314, | |
| "rewards/accuracy_reward": 0.4062500149011612, | |
| "rewards/format_reward": 0.4895833432674408, | |
| "step": 8, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.01592865912243724, | |
| "w_max": 1.5333127677440643, | |
| "w_mean": 1.2431240677833557, | |
| "w_min": 0.25, | |
| "w_std": 0.11287659406661987 | |
| }, | |
| { | |
| "completion_length": 3369.791748046875, | |
| "cov_mean": -2.2509159407491097e-05, | |
| "cov_std": 0.20683829113841057, | |
| "entropy": 0.45263671875, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.1632954180240631, | |
| "kl": 4.3064355850219727e-05, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0417, | |
| "reward": 0.4583333507180214, | |
| "reward_std": 0.3903508894145489, | |
| "rewards/accuracy_reward": 0.1145833358168602, | |
| "rewards/format_reward": 0.3437500074505806, | |
| "step": 9, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.029203591868281364, | |
| "w_max": 1.6281995177268982, | |
| "w_mean": 1.1540252268314362, | |
| "w_min": 1.9273542721946577e-23, | |
| "w_std": 0.15844954177737236 | |
| }, | |
| { | |
| "completion_length": 2794.2291870117188, | |
| "cov_mean": 7.258828873091261e-06, | |
| "cov_std": 0.22622444108128548, | |
| "entropy": 0.34716796875, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.11491074413061142, | |
| "kl": 2.664327621459961e-05, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0765, | |
| "reward": 0.6145833432674408, | |
| "reward_std": 0.4795500487089157, | |
| "rewards/accuracy_reward": 0.16666667070239782, | |
| "rewards/format_reward": 0.4479166828095913, | |
| "step": 10, | |
| "w_high_ratio": 0.1683393381536007, | |
| "w_low_ratio": 0.03143396740779281, | |
| "w_max": 1.8651617467403412, | |
| "w_mean": 1.2822044789791107, | |
| "w_min": 2.2624703592113335e-38, | |
| "w_std": 0.20682579837739468 | |
| }, | |
| { | |
| "completion_length": 3703.197998046875, | |
| "cov_mean": -2.311449361513951e-05, | |
| "cov_std": 0.18362887762486935, | |
| "entropy": 0.39697265625, | |
| "epoch": 0.012571428571428572, | |
| "grad_norm": 0.12180526554584503, | |
| "kl": 2.73287296295166e-05, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0401, | |
| "reward": 0.26041667722165585, | |
| "reward_std": 0.3744332268834114, | |
| "rewards/accuracy_reward": 0.10416667256504297, | |
| "rewards/format_reward": 0.15625000558793545, | |
| "step": 11, | |
| "w_high_ratio": 0.027614232152700424, | |
| "w_low_ratio": 0.024976021610200405, | |
| "w_max": 1.3482708036899567, | |
| "w_mean": 1.0835402309894562, | |
| "w_min": 0.25, | |
| "w_std": 0.12668619584292173 | |
| }, | |
| { | |
| "completion_length": 2611.260498046875, | |
| "cov_mean": 1.0368909215685562e-05, | |
| "cov_std": 0.22750693373382092, | |
| "entropy": 0.3984375, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.1790972799062729, | |
| "kl": 2.802908420562744e-05, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0576, | |
| "reward": 0.7395833805203438, | |
| "reward_std": 0.4462515264749527, | |
| "rewards/accuracy_reward": 0.1354166716337204, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 12, | |
| "w_high_ratio": 0.10206323117017746, | |
| "w_low_ratio": 0.0320228124037385, | |
| "w_max": 2.1756480634212494, | |
| "w_mean": 1.4741427898406982, | |
| "w_min": 4.420129648185005e-23, | |
| "w_std": 0.23598888516426086 | |
| }, | |
| { | |
| "completion_length": 3224.041748046875, | |
| "cov_mean": -2.2661331968265586e-05, | |
| "cov_std": 0.15935274586081505, | |
| "entropy": 0.38427734375, | |
| "epoch": 0.014857142857142857, | |
| "grad_norm": 0.24569930136203766, | |
| "kl": 1.7702579498291016e-05, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0498, | |
| "reward": 0.604166679084301, | |
| "reward_std": 0.30622391402721405, | |
| "rewards/accuracy_reward": 0.21875000558793545, | |
| "rewards/format_reward": 0.385416679084301, | |
| "step": 13, | |
| "w_high_ratio": 0.20555464923381805, | |
| "w_low_ratio": 0.01726952870376408, | |
| "w_max": 1.8901410400867462, | |
| "w_mean": 1.31855970621109, | |
| "w_min": 1.1411503146395655e-35, | |
| "w_std": 0.14450976066291332 | |
| }, | |
| { | |
| "completion_length": 3125.197998046875, | |
| "cov_mean": -2.9986793833813863e-05, | |
| "cov_std": 0.18112273141741753, | |
| "entropy": 0.3623046875, | |
| "epoch": 0.016, | |
| "grad_norm": 0.14652569591999054, | |
| "kl": 7.249414920806885e-06, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0596, | |
| "reward": 0.5000000111758709, | |
| "reward_std": 0.3975026085972786, | |
| "rewards/accuracy_reward": 0.13541667070239782, | |
| "rewards/format_reward": 0.3645833432674408, | |
| "step": 14, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.026353970635682344, | |
| "w_max": 1.6898008584976196, | |
| "w_mean": 1.1696374714374542, | |
| "w_min": 2.786338774190119e-34, | |
| "w_std": 0.17131789773702621 | |
| }, | |
| { | |
| "completion_length": 2945.3959350585938, | |
| "cov_mean": 3.4355500702076824e-06, | |
| "cov_std": 0.18097041826695204, | |
| "entropy": 0.37109375, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.08377102017402649, | |
| "kl": 2.804398536682129e-05, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0282, | |
| "reward": 0.6875000149011612, | |
| "reward_std": 0.37770550325512886, | |
| "rewards/accuracy_reward": 0.2395833395421505, | |
| "rewards/format_reward": 0.4479166716337204, | |
| "step": 15, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.027025693794712424, | |
| "w_max": 1.4225987792015076, | |
| "w_mean": 1.1180275976657867, | |
| "w_min": 8.951121255272305e-16, | |
| "w_std": 0.14841708727180958 | |
| }, | |
| { | |
| "completion_length": 3842.3646240234375, | |
| "cov_mean": -3.1302830393542536e-05, | |
| "cov_std": 0.16068686172366142, | |
| "entropy": 0.458984375, | |
| "epoch": 0.018285714285714287, | |
| "grad_norm": 0.1221930980682373, | |
| "kl": 2.4199485778808594e-05, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0431, | |
| "reward": 0.1562500037252903, | |
| "reward_std": 0.3155686669051647, | |
| "rewards/accuracy_reward": 0.05208333395421505, | |
| "rewards/format_reward": 0.10416666977107525, | |
| "step": 16, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.022678226232528687, | |
| "w_max": 1.1968038976192474, | |
| "w_mean": 1.0266980826854706, | |
| "w_min": 0.25, | |
| "w_std": 0.10604305937886238 | |
| }, | |
| { | |
| "completion_length": 2433.1875915527344, | |
| "cov_mean": 2.4951528757810593e-05, | |
| "cov_std": 0.27749199233949184, | |
| "entropy": 0.44970703125, | |
| "epoch": 0.019428571428571427, | |
| "grad_norm": 0.13208113610744476, | |
| "kl": 5.91278076171875e-05, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": -0.0048, | |
| "reward": 0.8854166865348816, | |
| "reward_std": 0.4504813477396965, | |
| "rewards/accuracy_reward": 0.2708333395421505, | |
| "rewards/format_reward": 0.6145833358168602, | |
| "step": 17, | |
| "w_high_ratio": 0.171035997569561, | |
| "w_low_ratio": 0.03585993289016187, | |
| "w_max": 2.199991285800934, | |
| "w_mean": 1.4317797720432281, | |
| "w_min": 0.25, | |
| "w_std": 0.24799126759171486 | |
| }, | |
| { | |
| "completion_length": 3167.4791870117188, | |
| "cov_mean": -2.742706919889315e-05, | |
| "cov_std": 0.2498251087963581, | |
| "entropy": 0.369140625, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.15079385042190552, | |
| "kl": 2.0952895283699036e-05, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0577, | |
| "reward": 0.5729166939854622, | |
| "reward_std": 0.5097959190607071, | |
| "rewards/accuracy_reward": 0.1666666716337204, | |
| "rewards/format_reward": 0.4062500149011612, | |
| "step": 18, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.034274401143193245, | |
| "w_max": 1.457490622997284, | |
| "w_mean": 1.1403506994247437, | |
| "w_min": 0.0, | |
| "w_std": 0.16276290826499462 | |
| }, | |
| { | |
| "completion_length": 3139.635498046875, | |
| "cov_mean": 1.529891596874222e-05, | |
| "cov_std": 0.13174043968319893, | |
| "entropy": 0.39208984375, | |
| "epoch": 0.021714285714285714, | |
| "grad_norm": 0.08939936012029648, | |
| "kl": 5.739927291870117e-05, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0233, | |
| "reward": 0.8541666828095913, | |
| "reward_std": 0.3213166669011116, | |
| "rewards/accuracy_reward": 0.3750000111758709, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 19, | |
| "w_high_ratio": 0.035190850496292114, | |
| "w_low_ratio": 0.016721592284739017, | |
| "w_max": 1.647162914276123, | |
| "w_mean": 1.2506683766841888, | |
| "w_min": 0.25, | |
| "w_std": 0.09711403585970402 | |
| }, | |
| { | |
| "completion_length": 2464.385498046875, | |
| "cov_mean": 3.157450896651426e-05, | |
| "cov_std": 0.283736914396286, | |
| "entropy": 0.3369140625, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.3488742411136627, | |
| "kl": 9.429454803466797e-05, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.0914, | |
| "reward": 0.9375000596046448, | |
| "reward_std": 0.4943716749548912, | |
| "rewards/accuracy_reward": 0.2604166716337204, | |
| "rewards/format_reward": 0.6770833656191826, | |
| "step": 20, | |
| "w_high_ratio": 0.0566110759973526, | |
| "w_low_ratio": 0.031579687260091305, | |
| "w_max": 2.3184494078159332, | |
| "w_mean": 1.4481623768806458, | |
| "w_min": 0.0, | |
| "w_std": 0.28026906587183475 | |
| }, | |
| { | |
| "completion_length": 2847.21875, | |
| "cov_mean": -2.243259518763807e-05, | |
| "cov_std": 0.18684318475425243, | |
| "entropy": 0.423828125, | |
| "epoch": 0.024, | |
| "grad_norm": 0.13671471178531647, | |
| "kl": 0.00033351778984069824, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0498, | |
| "reward": 0.6562500204890966, | |
| "reward_std": 0.38981083035469055, | |
| "rewards/accuracy_reward": 0.18750000465661287, | |
| "rewards/format_reward": 0.4687500027939677, | |
| "step": 21, | |
| "w_high_ratio": 0.08715118188410997, | |
| "w_low_ratio": 0.021108672255650163, | |
| "w_max": 1.9608261287212372, | |
| "w_mean": 1.387522131204605, | |
| "w_min": 4.576730842832761e-23, | |
| "w_std": 0.14338573440909386 | |
| }, | |
| { | |
| "completion_length": 1849.3542175292969, | |
| "cov_mean": -5.1019123930018395e-05, | |
| "cov_std": 0.208794716745615, | |
| "entropy": 0.3994140625, | |
| "epoch": 0.025142857142857144, | |
| "grad_norm": 0.1823095828294754, | |
| "kl": 0.00039577484130859375, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.0503, | |
| "reward": 1.1041666716337204, | |
| "reward_std": 0.363413717597723, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 22, | |
| "w_high_ratio": 0.3683718554675579, | |
| "w_low_ratio": 0.028410385129973292, | |
| "w_max": 2.5447845458984375, | |
| "w_mean": 1.7170847058296204, | |
| "w_min": 1.7296456515038733e-32, | |
| "w_std": 0.18113290891051292 | |
| }, | |
| { | |
| "completion_length": 2786.604217529297, | |
| "cov_mean": 6.156731569717522e-05, | |
| "cov_std": 0.21581846103072166, | |
| "entropy": 0.3828125, | |
| "epoch": 0.026285714285714287, | |
| "grad_norm": 0.10202132165431976, | |
| "kl": 0.00020551681518554688, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0436, | |
| "reward": 0.708333358168602, | |
| "reward_std": 0.47307053953409195, | |
| "rewards/accuracy_reward": 0.22916667722165585, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 23, | |
| "w_high_ratio": 0.05155515298247337, | |
| "w_low_ratio": 0.03038623696193099, | |
| "w_max": 1.8245242238044739, | |
| "w_mean": 1.2686880826950073, | |
| "w_min": 2.0108632963061125e-43, | |
| "w_std": 0.19650068879127502 | |
| }, | |
| { | |
| "completion_length": 2932.14599609375, | |
| "cov_mean": 3.685860542645969e-05, | |
| "cov_std": 0.19567562174052, | |
| "entropy": 0.35986328125, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.11536505818367004, | |
| "kl": 0.00012372806668281555, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0259, | |
| "reward": 0.8229167014360428, | |
| "reward_std": 0.41623104363679886, | |
| "rewards/accuracy_reward": 0.2708333358168602, | |
| "rewards/format_reward": 0.5520833507180214, | |
| "step": 24, | |
| "w_high_ratio": 0.05616182088851929, | |
| "w_low_ratio": 0.025760386954061687, | |
| "w_max": 1.7301380336284637, | |
| "w_mean": 1.2399356663227081, | |
| "w_min": 0.25, | |
| "w_std": 0.16614723671227694 | |
| }, | |
| { | |
| "completion_length": 2980.104248046875, | |
| "cov_mean": 5.5617931593587855e-06, | |
| "cov_std": 0.20383853651583195, | |
| "entropy": 0.43408203125, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.13895268738269806, | |
| "kl": 0.0003798753023147583, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0328, | |
| "reward": 0.6145833609625697, | |
| "reward_std": 0.4326799139380455, | |
| "rewards/accuracy_reward": 0.20833334140479565, | |
| "rewards/format_reward": 0.4062500102445483, | |
| "step": 25, | |
| "w_high_ratio": 0.15471260249614716, | |
| "w_low_ratio": 0.027910931850783527, | |
| "w_max": 1.878886878490448, | |
| "w_mean": 1.3457823991775513, | |
| "w_min": 2.1938871947043534e-19, | |
| "w_std": 0.21430648770183325 | |
| }, | |
| { | |
| "completion_length": 3121.822998046875, | |
| "cov_mean": -1.5570902263561948e-05, | |
| "cov_std": 0.12479476444423199, | |
| "entropy": 0.41845703125, | |
| "epoch": 0.029714285714285714, | |
| "grad_norm": 0.055989839136600494, | |
| "kl": 5.704164505004883e-05, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.006, | |
| "reward": 0.7395833656191826, | |
| "reward_std": 0.2778088226914406, | |
| "rewards/accuracy_reward": 0.25000000558793545, | |
| "rewards/format_reward": 0.4895833358168602, | |
| "step": 26, | |
| "w_high_ratio": 0.05233287438750267, | |
| "w_low_ratio": 0.016803464153781533, | |
| "w_max": 1.667470008134842, | |
| "w_mean": 1.2069356143474579, | |
| "w_min": 1.8282741064045888e-40, | |
| "w_std": 0.12309898342937231 | |
| }, | |
| { | |
| "completion_length": 3419.697998046875, | |
| "cov_mean": -8.16069814391085e-06, | |
| "cov_std": 0.2326441928744316, | |
| "entropy": 0.45654296875, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.1154065951704979, | |
| "kl": 0.0001367814838886261, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0629, | |
| "reward": 0.4687500223517418, | |
| "reward_std": 0.4522514268755913, | |
| "rewards/accuracy_reward": 0.11458333861082792, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 27, | |
| "w_high_ratio": 0.125, | |
| "w_low_ratio": 0.03899317281320691, | |
| "w_max": 1.7131148278713226, | |
| "w_mean": 1.2755843102931976, | |
| "w_min": 2.3244726053753363e-31, | |
| "w_std": 0.15411211177706718 | |
| }, | |
| { | |
| "completion_length": 3003.3334350585938, | |
| "cov_mean": 3.4548415897006635e-06, | |
| "cov_std": 0.18210824206471443, | |
| "entropy": 0.40576171875, | |
| "epoch": 0.032, | |
| "grad_norm": 0.10329318046569824, | |
| "kl": 0.00033906102180480957, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.0466, | |
| "reward": 0.729166679084301, | |
| "reward_std": 0.4190382733941078, | |
| "rewards/accuracy_reward": 0.291666679084301, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 28, | |
| "w_high_ratio": 0.045407865196466446, | |
| "w_low_ratio": 0.020685997209511697, | |
| "w_max": 1.9191896319389343, | |
| "w_mean": 1.28102046251297, | |
| "w_min": 1.1079171471645686e-36, | |
| "w_std": 0.15209556370973587 | |
| }, | |
| { | |
| "completion_length": 3622.8438110351562, | |
| "cov_mean": -1.4215014289220562e-05, | |
| "cov_std": 0.19959762692451477, | |
| "entropy": 0.43701171875, | |
| "epoch": 0.03314285714285714, | |
| "grad_norm": 0.11793094128370285, | |
| "kl": 0.00043398141860961914, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0686, | |
| "reward": 0.2812500149011612, | |
| "reward_std": 0.343124657869339, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.2187500074505806, | |
| "step": 29, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.03135715611279011, | |
| "w_max": 1.389756977558136, | |
| "w_mean": 1.1044960916042328, | |
| "w_min": 0.5, | |
| "w_std": 0.15529824048280716 | |
| }, | |
| { | |
| "completion_length": 3211.6563110351562, | |
| "cov_mean": 2.0011442074974184e-05, | |
| "cov_std": 0.3438211902976036, | |
| "entropy": 0.40087890625, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.15425726771354675, | |
| "kl": 0.0005748271942138672, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.096, | |
| "reward": 0.7708333544433117, | |
| "reward_std": 0.6762835085391998, | |
| "rewards/accuracy_reward": 0.2812500139698386, | |
| "rewards/format_reward": 0.489583358168602, | |
| "step": 30, | |
| "w_high_ratio": 0.0955454632639885, | |
| "w_low_ratio": 0.04459251323714852, | |
| "w_max": 1.7270594835281372, | |
| "w_mean": 1.2695180475711823, | |
| "w_min": 0.0, | |
| "w_std": 0.22970640659332275 | |
| }, | |
| { | |
| "completion_length": 3313.7500610351562, | |
| "cov_mean": -6.734976523148362e-07, | |
| "cov_std": 0.1561581064015627, | |
| "entropy": 0.38623046875, | |
| "epoch": 0.03542857142857143, | |
| "grad_norm": 0.12495917081832886, | |
| "kl": 0.00024247169494628906, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.0659, | |
| "reward": 0.47916669212281704, | |
| "reward_std": 0.3414399288594723, | |
| "rewards/accuracy_reward": 0.16666667722165585, | |
| "rewards/format_reward": 0.31250001303851604, | |
| "step": 31, | |
| "w_high_ratio": 0.12074629962444305, | |
| "w_low_ratio": 0.02205055020749569, | |
| "w_max": 1.9694485068321228, | |
| "w_mean": 1.323824942111969, | |
| "w_min": 3.479372530225627e-30, | |
| "w_std": 0.15388164669275284 | |
| }, | |
| { | |
| "completion_length": 3430.5521850585938, | |
| "cov_mean": -1.8880080915550934e-05, | |
| "cov_std": 0.24603740125894547, | |
| "entropy": 0.4443359375, | |
| "epoch": 0.036571428571428574, | |
| "grad_norm": 0.10446158051490784, | |
| "kl": 0.00040030479431152344, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.0417, | |
| "reward": 0.6875000223517418, | |
| "reward_std": 0.4970519095659256, | |
| "rewards/accuracy_reward": 0.2604166679084301, | |
| "rewards/format_reward": 0.4270833507180214, | |
| "step": 32, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.03472677152603865, | |
| "w_max": 1.578925609588623, | |
| "w_mean": 1.1632550954818726, | |
| "w_min": 0.0, | |
| "w_std": 0.17994992434978485 | |
| }, | |
| { | |
| "completion_length": 3569.229248046875, | |
| "cov_mean": -1.2864127711509354e-05, | |
| "cov_std": 0.21655914932489395, | |
| "entropy": 0.3828125, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.11952047049999237, | |
| "kl": 0.00048720836639404297, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0213, | |
| "reward": 0.5833333507180214, | |
| "reward_std": 0.4569981172680855, | |
| "rewards/accuracy_reward": 0.229166679084301, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 33, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.03175507392734289, | |
| "w_max": 1.3123357892036438, | |
| "w_mean": 1.0974721312522888, | |
| "w_min": 0.25, | |
| "w_std": 0.16161495074629784 | |
| }, | |
| { | |
| "completion_length": 2714.0000610351562, | |
| "cov_mean": -3.014505455212202e-05, | |
| "cov_std": 0.24434123933315277, | |
| "entropy": 0.462890625, | |
| "epoch": 0.038857142857142854, | |
| "grad_norm": 0.24054297804832458, | |
| "kl": 0.0010285377502441406, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0264, | |
| "reward": 0.8541666865348816, | |
| "reward_std": 0.43565599620342255, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 34, | |
| "w_high_ratio": 0.08968023210763931, | |
| "w_low_ratio": 0.03058682754635811, | |
| "w_max": 1.8369105458259583, | |
| "w_mean": 1.3255797028541565, | |
| "w_min": 0.25, | |
| "w_std": 0.23013706505298615 | |
| }, | |
| { | |
| "completion_length": 3206.260498046875, | |
| "cov_mean": 1.827982691793295e-05, | |
| "cov_std": 0.2361072190105915, | |
| "entropy": 0.42578125, | |
| "epoch": 0.04, | |
| "grad_norm": 0.13136403262615204, | |
| "kl": 0.0009332895278930664, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0513, | |
| "reward": 0.5625000149011612, | |
| "reward_std": 0.47499874979257584, | |
| "rewards/accuracy_reward": 0.1979166679084301, | |
| "rewards/format_reward": 0.3645833469927311, | |
| "step": 35, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.03539817640557885, | |
| "w_max": 1.4059478044509888, | |
| "w_mean": 1.1172049045562744, | |
| "w_min": 0.0, | |
| "w_std": 0.1749916821718216 | |
| }, | |
| { | |
| "completion_length": 3699.416748046875, | |
| "cov_mean": -1.4997711559772142e-05, | |
| "cov_std": 0.20064959302544594, | |
| "entropy": 0.50439453125, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.1509845107793808, | |
| "kl": 0.0011619925498962402, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.0429, | |
| "reward": 0.2500000102445483, | |
| "reward_std": 0.41391417384147644, | |
| "rewards/accuracy_reward": 0.031250000931322575, | |
| "rewards/format_reward": 0.21875000279396772, | |
| "step": 36, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.029611330712214112, | |
| "w_max": 1.3176401853561401, | |
| "w_mean": 1.0739335417747498, | |
| "w_min": 0.0, | |
| "w_std": 0.15683909878134727 | |
| }, | |
| { | |
| "completion_length": 3516.1563110351562, | |
| "cov_mean": -2.547230405980372e-05, | |
| "cov_std": 0.11416707932949066, | |
| "entropy": 0.43994140625, | |
| "epoch": 0.04228571428571429, | |
| "grad_norm": 0.08760611712932587, | |
| "kl": 0.0007746219635009766, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0156, | |
| "reward": 0.22916667442768812, | |
| "reward_std": 0.19299374520778656, | |
| "rewards/accuracy_reward": 0.010416666977107525, | |
| "rewards/format_reward": 0.21875001024454832, | |
| "step": 37, | |
| "w_high_ratio": 0.05747595056891441, | |
| "w_low_ratio": 0.013685875572264194, | |
| "w_max": 1.6239450573921204, | |
| "w_mean": 1.1775790452957153, | |
| "w_min": 0.25, | |
| "w_std": 0.12097344920039177 | |
| }, | |
| { | |
| "completion_length": 3670.822998046875, | |
| "cov_mean": -4.859739419771358e-06, | |
| "cov_std": 0.11942135915160179, | |
| "entropy": 0.4833984375, | |
| "epoch": 0.04342857142857143, | |
| "grad_norm": 0.08295677602291107, | |
| "kl": 0.0007152557373046875, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.016, | |
| "reward": 0.322916679084301, | |
| "reward_std": 0.24508872628211975, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/format_reward": 0.1770833432674408, | |
| "step": 38, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.015665842220187187, | |
| "w_max": 1.2278587818145752, | |
| "w_mean": 1.0692134499549866, | |
| "w_min": 0.5, | |
| "w_std": 0.08390428125858307 | |
| }, | |
| { | |
| "completion_length": 3133.5521850585938, | |
| "cov_mean": 1.6014040738809854e-05, | |
| "cov_std": 0.15454116463661194, | |
| "entropy": 0.38427734375, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.09236446022987366, | |
| "kl": 0.0011830329895019531, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0191, | |
| "reward": 0.770833358168602, | |
| "reward_std": 0.30482664704322815, | |
| "rewards/accuracy_reward": 0.25000000558793545, | |
| "rewards/format_reward": 0.520833358168602, | |
| "step": 39, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.021826621610671282, | |
| "w_max": 1.4440618753433228, | |
| "w_mean": 1.1500347554683685, | |
| "w_min": 0.25, | |
| "w_std": 0.10476426035165787 | |
| }, | |
| { | |
| "completion_length": 2921.5938110351562, | |
| "cov_mean": 3.307407860120293e-05, | |
| "cov_std": 0.18591826409101486, | |
| "entropy": 0.4111328125, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.1281704157590866, | |
| "kl": 0.0041351318359375, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0448, | |
| "reward": 0.6770833507180214, | |
| "reward_std": 0.39515648037195206, | |
| "rewards/accuracy_reward": 0.17708333395421505, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 40, | |
| "w_high_ratio": 0.09708013385534286, | |
| "w_low_ratio": 0.02820506482385099, | |
| "w_max": 1.9662592709064484, | |
| "w_mean": 1.3416504263877869, | |
| "w_min": 1.7285604841107016e-17, | |
| "w_std": 0.17729274183511734 | |
| }, | |
| { | |
| "completion_length": 3497.4063110351562, | |
| "cov_mean": -7.282104343175888e-05, | |
| "cov_std": 0.2843910865485668, | |
| "entropy": 0.40771484375, | |
| "epoch": 0.046857142857142854, | |
| "grad_norm": 0.16035234928131104, | |
| "kl": 0.0008625984191894531, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0447, | |
| "reward": 0.4583333507180214, | |
| "reward_std": 0.5345464050769806, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/format_reward": 0.3333333507180214, | |
| "step": 41, | |
| "w_high_ratio": 0.015127741731703281, | |
| "w_low_ratio": 0.04166511259973049, | |
| "w_max": 1.6356081068515778, | |
| "w_mean": 1.1487390100955963, | |
| "w_min": 3.531651517161998e-24, | |
| "w_std": 0.21111097559332848 | |
| }, | |
| { | |
| "completion_length": 3070.854248046875, | |
| "cov_mean": 4.5878337004978675e-06, | |
| "cov_std": 0.0855317497625947, | |
| "entropy": 0.48388671875, | |
| "epoch": 0.048, | |
| "grad_norm": 0.06175260245800018, | |
| "kl": 0.0006914138793945312, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.0063, | |
| "reward": 0.3229166716337204, | |
| "reward_std": 0.17353228479623795, | |
| "rewards/accuracy_reward": 0.010416666977107525, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 42, | |
| "w_high_ratio": 0.05438845232129097, | |
| "w_low_ratio": 0.008411283954046667, | |
| "w_max": 1.544800043106079, | |
| "w_mean": 1.1694203615188599, | |
| "w_min": 0.5, | |
| "w_std": 0.07276808470487595 | |
| }, | |
| { | |
| "completion_length": 3378.8125610351562, | |
| "cov_mean": -2.5848277346085524e-05, | |
| "cov_std": 0.2625325694680214, | |
| "entropy": 0.43701171875, | |
| "epoch": 0.04914285714285714, | |
| "grad_norm": 0.18838584423065186, | |
| "kl": 0.0014505386352539062, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0805, | |
| "reward": 0.479166679084301, | |
| "reward_std": 0.5196144729852676, | |
| "rewards/accuracy_reward": 0.18750000558793545, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 43, | |
| "w_high_ratio": 0.05749715492129326, | |
| "w_low_ratio": 0.038668573601171374, | |
| "w_max": 1.9480818212032318, | |
| "w_mean": 1.1805387139320374, | |
| "w_min": 2.3359345679368763e-34, | |
| "w_std": 0.1971494909375906 | |
| }, | |
| { | |
| "completion_length": 2916.9791717529297, | |
| "cov_mean": -3.640100658230949e-06, | |
| "cov_std": 0.23978274501860142, | |
| "entropy": 0.41162109375, | |
| "epoch": 0.05028571428571429, | |
| "grad_norm": 0.16967085003852844, | |
| "kl": 0.005632162094116211, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0373, | |
| "reward": 0.7708333460614085, | |
| "reward_std": 0.43626825511455536, | |
| "rewards/accuracy_reward": 0.2812500074505806, | |
| "rewards/format_reward": 0.4895833386108279, | |
| "step": 44, | |
| "w_high_ratio": 0.11527429521083832, | |
| "w_low_ratio": 0.0326957437209785, | |
| "w_max": 1.8800793588161469, | |
| "w_mean": 1.356241375207901, | |
| "w_min": 0.0, | |
| "w_std": 0.22198213264346123 | |
| }, | |
| { | |
| "completion_length": 3686.1875610351562, | |
| "cov_mean": -1.2132580195611808e-05, | |
| "cov_std": 0.17040352895855904, | |
| "entropy": 0.4228515625, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.0952582135796547, | |
| "kl": 0.0019674301147460938, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0175, | |
| "reward": 0.4687500149011612, | |
| "reward_std": 0.3746139518916607, | |
| "rewards/accuracy_reward": 0.2187500074505806, | |
| "rewards/format_reward": 0.2500000074505806, | |
| "step": 45, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.022616846952587366, | |
| "w_max": 1.2881874740123749, | |
| "w_mean": 1.0595116317272186, | |
| "w_min": 0.25, | |
| "w_std": 0.11086289770901203 | |
| }, | |
| { | |
| "completion_length": 3573.8229370117188, | |
| "cov_mean": -5.21990023116814e-06, | |
| "cov_std": 0.09164197091013193, | |
| "entropy": 0.5361328125, | |
| "epoch": 0.052571428571428575, | |
| "grad_norm": 0.061532407999038696, | |
| "kl": 0.0024976730346679688, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0226, | |
| "reward": 0.20833333488553762, | |
| "reward_std": 0.21344273164868355, | |
| "rewards/accuracy_reward": 0.041666666977107525, | |
| "rewards/format_reward": 0.1666666679084301, | |
| "step": 46, | |
| "w_high_ratio": 0.05877559259533882, | |
| "w_low_ratio": 0.011987740639597178, | |
| "w_max": 1.5505282580852509, | |
| "w_mean": 1.1429267823696136, | |
| "w_min": 3.571343117882909e-28, | |
| "w_std": 0.09135792590677738 | |
| }, | |
| { | |
| "completion_length": 3139.9375610351562, | |
| "cov_mean": 1.948155477293767e-05, | |
| "cov_std": 0.3308473080396652, | |
| "entropy": 0.43017578125, | |
| "epoch": 0.053714285714285714, | |
| "grad_norm": 0.287928968667984, | |
| "kl": 0.0011713504791259766, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0536, | |
| "reward": 0.8541666716337204, | |
| "reward_std": 0.5766339302062988, | |
| "rewards/accuracy_reward": 0.322916679084301, | |
| "rewards/format_reward": 0.5312500149011612, | |
| "step": 47, | |
| "w_high_ratio": 0.12357743084430695, | |
| "w_low_ratio": 0.038592321798205376, | |
| "w_max": 2.0394512712955475, | |
| "w_mean": 1.3565464913845062, | |
| "w_min": 0.25, | |
| "w_std": 0.26638074964284897 | |
| }, | |
| { | |
| "completion_length": 3051.385467529297, | |
| "cov_mean": -6.733167197126022e-06, | |
| "cov_std": 0.19061635434627533, | |
| "entropy": 0.4501953125, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.1374855488538742, | |
| "kl": 0.00525665283203125, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.0486, | |
| "reward": 0.5208333432674408, | |
| "reward_std": 0.37667082995176315, | |
| "rewards/accuracy_reward": 0.1770833395421505, | |
| "rewards/format_reward": 0.34375000838190317, | |
| "step": 48, | |
| "w_high_ratio": 0.12677159160375595, | |
| "w_low_ratio": 0.023622059728950262, | |
| "w_max": 2.024912714958191, | |
| "w_mean": 1.3519074320793152, | |
| "w_min": 7.271374424684445e-33, | |
| "w_std": 0.19886896945536137 | |
| }, | |
| { | |
| "completion_length": 2580.0521850585938, | |
| "cov_mean": 2.3781666641298216e-05, | |
| "cov_std": 0.2574050724506378, | |
| "entropy": 0.39306640625, | |
| "epoch": 0.056, | |
| "grad_norm": 0.13522955775260925, | |
| "kl": 0.0030527114868164062, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.0379, | |
| "reward": 0.8020833730697632, | |
| "reward_std": 0.47789302468299866, | |
| "rewards/accuracy_reward": 0.20833333395421505, | |
| "rewards/format_reward": 0.5937500149011612, | |
| "step": 49, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.03599585313349962, | |
| "w_max": 1.5133522152900696, | |
| "w_mean": 1.177164077758789, | |
| "w_min": 3.952712643244228e-41, | |
| "w_std": 0.19642843678593636 | |
| }, | |
| { | |
| "completion_length": 3276.2188110351562, | |
| "cov_mean": 3.7818183841409336e-05, | |
| "cov_std": 0.1878571268171072, | |
| "entropy": 0.36767578125, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.10174579173326492, | |
| "kl": 0.0021190643310546875, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.04, | |
| "reward": 0.5937500102445483, | |
| "reward_std": 0.3714478053152561, | |
| "rewards/accuracy_reward": 0.2604166716337204, | |
| "rewards/format_reward": 0.3333333386108279, | |
| "step": 50, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.0255408501252532, | |
| "w_max": 1.3753422796726227, | |
| "w_mean": 1.1391299068927765, | |
| "w_min": 0.0, | |
| "w_std": 0.12728617619723082 | |
| }, | |
| { | |
| "completion_length": 2626.7084350585938, | |
| "cov_mean": -8.886720934242476e-06, | |
| "cov_std": 0.154384421184659, | |
| "entropy": 0.46826171875, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.10762708634138107, | |
| "kl": 0.006221771240234375, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.038, | |
| "reward": 0.5416666865348816, | |
| "reward_std": 0.27000611275434494, | |
| "rewards/accuracy_reward": 0.08333333674818277, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 51, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.022287086583673954, | |
| "w_max": 1.522942990064621, | |
| "w_mean": 1.16915962100029, | |
| "w_min": 0.25, | |
| "w_std": 0.10866253450512886 | |
| }, | |
| { | |
| "completion_length": 3225.6875610351562, | |
| "cov_mean": -3.420543362153694e-05, | |
| "cov_std": 0.3140456900000572, | |
| "entropy": 0.41357421875, | |
| "epoch": 0.05942857142857143, | |
| "grad_norm": 0.16037067770957947, | |
| "kl": 0.0017061233520507812, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0307, | |
| "reward": 0.833333358168602, | |
| "reward_std": 0.5814172253012657, | |
| "rewards/accuracy_reward": 0.3854166865348816, | |
| "rewards/format_reward": 0.4479166716337204, | |
| "step": 52, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.041595788672566414, | |
| "w_max": 1.4235666990280151, | |
| "w_mean": 1.1377580165863037, | |
| "w_min": 0.25, | |
| "w_std": 0.20165112614631653 | |
| }, | |
| { | |
| "completion_length": 3038.3126220703125, | |
| "cov_mean": 3.059411119465949e-05, | |
| "cov_std": 0.34266950748860836, | |
| "entropy": 0.43994140625, | |
| "epoch": 0.060571428571428575, | |
| "grad_norm": 0.18063929677009583, | |
| "kl": 0.004637241363525391, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0228, | |
| "reward": 0.9062500447034836, | |
| "reward_std": 0.5983624011278152, | |
| "rewards/accuracy_reward": 0.3229166828095913, | |
| "rewards/format_reward": 0.583333358168602, | |
| "step": 53, | |
| "w_high_ratio": 0.0893278568983078, | |
| "w_low_ratio": 0.048161128303036094, | |
| "w_max": 1.605346292257309, | |
| "w_mean": 1.2202682793140411, | |
| "w_min": 3.5021185811519566e-32, | |
| "w_std": 0.2196234930306673 | |
| }, | |
| { | |
| "completion_length": 3048.3021850585938, | |
| "cov_mean": -3.5217308322899044e-05, | |
| "cov_std": 0.40455804020166397, | |
| "entropy": 0.4111328125, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.26634302735328674, | |
| "kl": 0.0014967918395996094, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.0485, | |
| "reward": 1.0104167088866234, | |
| "reward_std": 0.6799703985452652, | |
| "rewards/accuracy_reward": 0.4375000111758709, | |
| "rewards/format_reward": 0.5729166828095913, | |
| "step": 54, | |
| "w_high_ratio": 0.14361883699893951, | |
| "w_low_ratio": 0.03988745156675577, | |
| "w_max": 1.914384812116623, | |
| "w_mean": 1.3435330390930176, | |
| "w_min": 0.0, | |
| "w_std": 0.2648167684674263 | |
| }, | |
| { | |
| "completion_length": 3372.3959350585938, | |
| "cov_mean": 5.4979325341264484e-06, | |
| "cov_std": 0.25056118331849575, | |
| "entropy": 0.44189453125, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.0977473184466362, | |
| "kl": 0.0019044876098632812, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0381, | |
| "reward": 0.6562500223517418, | |
| "reward_std": 0.4964308738708496, | |
| "rewards/accuracy_reward": 0.27083334140479565, | |
| "rewards/format_reward": 0.3854166716337204, | |
| "step": 55, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.0342027700971812, | |
| "w_max": 1.4320927858352661, | |
| "w_mean": 1.1150383353233337, | |
| "w_min": 0.0, | |
| "w_std": 0.18786128982901573 | |
| }, | |
| { | |
| "completion_length": 3225.2709045410156, | |
| "cov_mean": 5.9108706409460865e-06, | |
| "cov_std": 0.22232061624526978, | |
| "entropy": 0.43115234375, | |
| "epoch": 0.064, | |
| "grad_norm": 0.11878591775894165, | |
| "kl": 0.00154876708984375, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0112, | |
| "reward": 0.6458333656191826, | |
| "reward_std": 0.39009611308574677, | |
| "rewards/accuracy_reward": 0.19791666697710752, | |
| "rewards/format_reward": 0.4479166828095913, | |
| "step": 56, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.028656802838668227, | |
| "w_max": 1.5001116394996643, | |
| "w_mean": 1.2033225297927856, | |
| "w_min": 1.9247332911380988e-31, | |
| "w_std": 0.18417230807244778 | |
| }, | |
| { | |
| "completion_length": 3587.6458740234375, | |
| "cov_mean": -4.519502726907376e-05, | |
| "cov_std": 0.24361642450094223, | |
| "entropy": 0.34423828125, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.11000871658325195, | |
| "kl": 0.0006794929504394531, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0423, | |
| "reward": 0.4375, | |
| "reward_std": 0.467288788408041, | |
| "rewards/accuracy_reward": 0.1354166716337204, | |
| "rewards/format_reward": 0.3020833358168602, | |
| "step": 57, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.03233239706605673, | |
| "w_max": 1.2505627870559692, | |
| "w_mean": 1.0757884085178375, | |
| "w_min": 0.25, | |
| "w_std": 0.14500370249152184 | |
| }, | |
| { | |
| "completion_length": 2626.947998046875, | |
| "cov_mean": -1.4388041108759353e-05, | |
| "cov_std": 0.2534067742526531, | |
| "entropy": 0.38427734375, | |
| "epoch": 0.06628571428571428, | |
| "grad_norm": 0.15062791109085083, | |
| "kl": 0.0038776397705078125, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.0665, | |
| "reward": 0.916666716337204, | |
| "reward_std": 0.43652553856372833, | |
| "rewards/accuracy_reward": 0.2812500009313226, | |
| "rewards/format_reward": 0.635416692122817, | |
| "step": 58, | |
| "w_high_ratio": 0.12001378461718559, | |
| "w_low_ratio": 0.03707017982378602, | |
| "w_max": 2.2393843233585358, | |
| "w_mean": 1.5285146832466125, | |
| "w_min": 2.338311714957214e-41, | |
| "w_std": 0.2577071785926819 | |
| }, | |
| { | |
| "completion_length": 3455.0208740234375, | |
| "cov_mean": -2.0748303086293163e-05, | |
| "cov_std": 0.2206678595393896, | |
| "entropy": 0.3994140625, | |
| "epoch": 0.06742857142857143, | |
| "grad_norm": 0.1635066419839859, | |
| "kl": 0.0013880729675292969, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.0447, | |
| "reward": 0.39583333395421505, | |
| "reward_std": 0.3752100467681885, | |
| "rewards/accuracy_reward": 0.14583333861082792, | |
| "rewards/format_reward": 0.2500000102445483, | |
| "step": 59, | |
| "w_high_ratio": 0.121368907392025, | |
| "w_low_ratio": 0.02756796986795962, | |
| "w_max": 1.8036501705646515, | |
| "w_mean": 1.2463297247886658, | |
| "w_min": 0.25, | |
| "w_std": 0.14911611750721931 | |
| }, | |
| { | |
| "completion_length": 3254.92724609375, | |
| "cov_mean": -1.7401176137354923e-05, | |
| "cov_std": 0.17664196342229843, | |
| "entropy": 0.396484375, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.08288750052452087, | |
| "kl": 0.0025072097778320312, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0281, | |
| "reward": 0.4583333432674408, | |
| "reward_std": 0.39030885696411133, | |
| "rewards/accuracy_reward": 0.11458333488553762, | |
| "rewards/format_reward": 0.34375000558793545, | |
| "step": 60, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.022112081991508603, | |
| "w_max": 1.5709031820297241, | |
| "w_mean": 1.1311749517917633, | |
| "w_min": 4.6449540846206874e-42, | |
| "w_std": 0.12603357434272766 | |
| }, | |
| { | |
| "completion_length": 3376.6771850585938, | |
| "cov_mean": 2.7930617193305807e-05, | |
| "cov_std": 0.22145200800150633, | |
| "entropy": 0.400390625, | |
| "epoch": 0.06971428571428571, | |
| "grad_norm": 0.2130555361509323, | |
| "kl": 0.0013718605041503906, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.0505, | |
| "reward": 0.6666666669771075, | |
| "reward_std": 0.4248874858021736, | |
| "rewards/accuracy_reward": 0.1979166716337204, | |
| "rewards/format_reward": 0.4687500102445483, | |
| "step": 61, | |
| "w_high_ratio": 0.02656024508178234, | |
| "w_low_ratio": 0.02897683286573738, | |
| "w_max": 1.5726596117019653, | |
| "w_mean": 1.1504198908805847, | |
| "w_min": 0.25, | |
| "w_std": 0.15992471296340227 | |
| }, | |
| { | |
| "completion_length": 2804.3438110351562, | |
| "cov_mean": 5.110432311994373e-05, | |
| "cov_std": 0.3370564728975296, | |
| "entropy": 0.385986328125, | |
| "epoch": 0.07085714285714285, | |
| "grad_norm": 0.20217926800251007, | |
| "kl": 0.0056667327880859375, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0984, | |
| "reward": 0.854166679084301, | |
| "reward_std": 0.6372481435537338, | |
| "rewards/accuracy_reward": 0.29166667722165585, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 62, | |
| "w_high_ratio": 0.15436114370822906, | |
| "w_low_ratio": 0.04324930440634489, | |
| "w_max": 2.3450452983379364, | |
| "w_mean": 1.4747015237808228, | |
| "w_min": 2.7272514644043088e-36, | |
| "w_std": 0.31457675993442535 | |
| }, | |
| { | |
| "completion_length": 2748.1876220703125, | |
| "cov_mean": 9.652720564190531e-06, | |
| "cov_std": 0.3283480554819107, | |
| "entropy": 0.43310546875, | |
| "epoch": 0.072, | |
| "grad_norm": 0.15674079954624176, | |
| "kl": 0.0032906532287597656, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.0725, | |
| "reward": 1.031250037252903, | |
| "reward_std": 0.5360563546419144, | |
| "rewards/accuracy_reward": 0.3541666753590107, | |
| "rewards/format_reward": 0.6770833432674408, | |
| "step": 63, | |
| "w_high_ratio": 0.0625, | |
| "w_low_ratio": 0.04385069524869323, | |
| "w_max": 1.8759834170341492, | |
| "w_mean": 1.3464274108409882, | |
| "w_min": 0.0, | |
| "w_std": 0.1958361305296421 | |
| }, | |
| { | |
| "completion_length": 3358.3334350585938, | |
| "cov_mean": -4.510660664891475e-05, | |
| "cov_std": 0.2794957533478737, | |
| "entropy": 0.44921875, | |
| "epoch": 0.07314285714285715, | |
| "grad_norm": 0.12678052484989166, | |
| "kl": 0.005690097808837891, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.0603, | |
| "reward": 0.5937500149011612, | |
| "reward_std": 0.583733007311821, | |
| "rewards/accuracy_reward": 0.2187500037252903, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 64, | |
| "w_high_ratio": 0.044796403497457504, | |
| "w_low_ratio": 0.03829633165150881, | |
| "w_max": 1.6352408528327942, | |
| "w_mean": 1.167921930551529, | |
| "w_min": 2.6764800668604006e-43, | |
| "w_std": 0.20047394558787346 | |
| }, | |
| { | |
| "completion_length": 3026.5209350585938, | |
| "cov_mean": 1.992606485146098e-05, | |
| "cov_std": 0.17664698883891106, | |
| "entropy": 0.38525390625, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.08567796647548676, | |
| "kl": 0.0034112930297851562, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0309, | |
| "reward": 0.6250000260770321, | |
| "reward_std": 0.32293669879436493, | |
| "rewards/accuracy_reward": 0.16666666697710752, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 65, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.029133206233382225, | |
| "w_max": 1.3120096027851105, | |
| "w_mean": 1.1221435964107513, | |
| "w_min": 1.0468715588988584e-22, | |
| "w_std": 0.12845914252102375 | |
| }, | |
| { | |
| "completion_length": 2413.1250610351562, | |
| "cov_mean": -4.743512135974015e-06, | |
| "cov_std": 0.05960770323872566, | |
| "entropy": 0.35693359375, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.08712891489267349, | |
| "kl": 0.0032825469970703125, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0194, | |
| "reward": 0.8645833432674408, | |
| "reward_std": 0.13795074447989464, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 0.4895833358168602, | |
| "step": 66, | |
| "w_high_ratio": 0.04171403869986534, | |
| "w_low_ratio": 0.006300564622506499, | |
| "w_max": 1.5700030624866486, | |
| "w_mean": 1.1926406025886536, | |
| "w_min": 0.5264238715171814, | |
| "w_std": 0.06322706118226051 | |
| }, | |
| { | |
| "completion_length": 3792.9271240234375, | |
| "cov_mean": -9.492634717389592e-06, | |
| "cov_std": 0.12886795960366726, | |
| "entropy": 0.36865234375, | |
| "epoch": 0.07657142857142857, | |
| "grad_norm": 0.0828777328133583, | |
| "kl": 0.002822399139404297, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0179, | |
| "reward": 0.21875000558793545, | |
| "reward_std": 0.2869785502552986, | |
| "rewards/accuracy_reward": 0.052083334885537624, | |
| "rewards/format_reward": 0.16666667442768812, | |
| "step": 67, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.017554222606122494, | |
| "w_max": 1.1846114993095398, | |
| "w_mean": 1.0396882444620132, | |
| "w_min": 0.25, | |
| "w_std": 0.09070100169628859 | |
| }, | |
| { | |
| "completion_length": 2557.6146240234375, | |
| "cov_mean": 1.5981570413714508e-05, | |
| "cov_std": 0.31408151611685753, | |
| "entropy": 0.44384765625, | |
| "epoch": 0.07771428571428571, | |
| "grad_norm": 0.20466886460781097, | |
| "kl": 0.0033721923828125, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.0566, | |
| "reward": 0.8020833507180214, | |
| "reward_std": 0.5168246552348137, | |
| "rewards/accuracy_reward": 0.2395833432674408, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 68, | |
| "w_high_ratio": 0.04399501532316208, | |
| "w_low_ratio": 0.042061637388542295, | |
| "w_max": 1.8962246477603912, | |
| "w_mean": 1.305674433708191, | |
| "w_min": 4.959685019058809e-39, | |
| "w_std": 0.2475343719124794 | |
| }, | |
| { | |
| "completion_length": 3009.7084350585938, | |
| "cov_mean": -4.52714293714962e-05, | |
| "cov_std": 0.23395150154829025, | |
| "entropy": 0.5517578125, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.18218518793582916, | |
| "kl": 0.014312744140625, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0752, | |
| "reward": 0.4687500149011612, | |
| "reward_std": 0.43293242901563644, | |
| "rewards/accuracy_reward": 0.09375000279396772, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 69, | |
| "w_high_ratio": 0.057301439344882965, | |
| "w_low_ratio": 0.0333517212420702, | |
| "w_max": 1.9855602085590363, | |
| "w_mean": 1.2992196083068848, | |
| "w_min": 7.707141553786494e-45, | |
| "w_std": 0.19260139763355255 | |
| }, | |
| { | |
| "completion_length": 3280.0833740234375, | |
| "cov_mean": 2.8561088129208656e-05, | |
| "cov_std": 0.19498306885361671, | |
| "entropy": 0.3720703125, | |
| "epoch": 0.08, | |
| "grad_norm": 0.10543849319219589, | |
| "kl": 0.010352134704589844, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.0185, | |
| "reward": 0.5416667014360428, | |
| "reward_std": 0.3840207904577255, | |
| "rewards/accuracy_reward": 0.1250000074505806, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 70, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.02808787301182747, | |
| "w_max": 1.2882064878940582, | |
| "w_mean": 1.0968604385852814, | |
| "w_min": 0.25, | |
| "w_std": 0.1326066516339779 | |
| }, | |
| { | |
| "completion_length": 2855.7708740234375, | |
| "cov_mean": 4.004434208582097e-05, | |
| "cov_std": 0.13545112498104572, | |
| "entropy": 0.42724609375, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.11269883066415787, | |
| "kl": 0.014951705932617188, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.0154, | |
| "reward": 0.5729166967794299, | |
| "reward_std": 0.2492993399500847, | |
| "rewards/accuracy_reward": 0.1979166716337204, | |
| "rewards/format_reward": 0.3750000027939677, | |
| "step": 71, | |
| "w_high_ratio": 0.08807118237018585, | |
| "w_low_ratio": 0.01654834917280823, | |
| "w_max": 1.8029770255088806, | |
| "w_mean": 1.2483810186386108, | |
| "w_min": 0.25, | |
| "w_std": 0.1558239422738552 | |
| }, | |
| { | |
| "completion_length": 3550.322998046875, | |
| "cov_mean": -4.4116359276813455e-07, | |
| "cov_std": 0.24292385205626488, | |
| "entropy": 0.5107421875, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.20913389325141907, | |
| "kl": 0.0040874481201171875, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0498, | |
| "reward": 0.3750000102445483, | |
| "reward_std": 0.40810926631093025, | |
| "rewards/accuracy_reward": 0.0729166716337204, | |
| "rewards/format_reward": 0.3020833386108279, | |
| "step": 72, | |
| "w_high_ratio": 0.033293891698122025, | |
| "w_low_ratio": 0.03450615704059601, | |
| "w_max": 1.6246004700660706, | |
| "w_mean": 1.1392557322978973, | |
| "w_min": 2.138569791073276e-36, | |
| "w_std": 0.19269496202468872 | |
| }, | |
| { | |
| "completion_length": 3837.1146240234375, | |
| "cov_mean": 2.1203804863034748e-06, | |
| "cov_std": 0.15915799140930176, | |
| "entropy": 0.51171875, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.08118956536054611, | |
| "kl": 0.0014677047729492188, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0146, | |
| "reward": 0.2708333432674408, | |
| "reward_std": 0.25903886556625366, | |
| "rewards/accuracy_reward": 0.1041666716337204, | |
| "rewards/format_reward": 0.1666666716337204, | |
| "step": 73, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.020282023586332798, | |
| "w_max": 1.1690112948417664, | |
| "w_mean": 1.027027040719986, | |
| "w_min": 0.5, | |
| "w_std": 0.09119972214102745 | |
| }, | |
| { | |
| "completion_length": 3503.1250610351562, | |
| "cov_mean": -4.496445217228029e-05, | |
| "cov_std": 0.2548239603638649, | |
| "entropy": 0.41748046875, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.1568731665611267, | |
| "kl": 0.00296783447265625, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.0655, | |
| "reward": 0.5625000074505806, | |
| "reward_std": 0.4641239196062088, | |
| "rewards/accuracy_reward": 0.2604166679084301, | |
| "rewards/format_reward": 0.3020833358168602, | |
| "step": 74, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.033767144195735455, | |
| "w_max": 1.4706333875656128, | |
| "w_mean": 1.078809916973114, | |
| "w_min": 0.25, | |
| "w_std": 0.1507711410522461 | |
| }, | |
| { | |
| "completion_length": 3346.1875610351562, | |
| "cov_mean": -7.328241736104246e-06, | |
| "cov_std": 0.20871411636471748, | |
| "entropy": 0.404296875, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.10099554806947708, | |
| "kl": 0.0059261322021484375, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0554, | |
| "reward": 0.572916679084301, | |
| "reward_std": 0.3865407630801201, | |
| "rewards/accuracy_reward": 0.1875, | |
| "rewards/format_reward": 0.3854166716337204, | |
| "step": 75, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.032186293974518776, | |
| "w_max": 1.5220047235488892, | |
| "w_mean": 1.1460089683532715, | |
| "w_min": 0.25, | |
| "w_std": 0.17850109934806824 | |
| }, | |
| { | |
| "completion_length": 3092.635498046875, | |
| "cov_mean": 4.942317445966182e-06, | |
| "cov_std": 0.20054961927235126, | |
| "entropy": 0.44091796875, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.14435574412345886, | |
| "kl": 0.001827239990234375, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.02, | |
| "reward": 0.583333358168602, | |
| "reward_std": 0.34913603961467743, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/format_reward": 0.4791666939854622, | |
| "step": 76, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.030099061783403158, | |
| "w_max": 1.574487328529358, | |
| "w_mean": 1.184053212404251, | |
| "w_min": 1.2891045632755887e-30, | |
| "w_std": 0.1306541245430708 | |
| }, | |
| { | |
| "completion_length": 3414.0938110351562, | |
| "cov_mean": 4.712834697784274e-05, | |
| "cov_std": 0.23044732213020325, | |
| "entropy": 0.45263671875, | |
| "epoch": 0.088, | |
| "grad_norm": 0.27278050780296326, | |
| "kl": 0.0013208389282226562, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.015, | |
| "reward": 0.510416672565043, | |
| "reward_std": 0.4236603006720543, | |
| "rewards/accuracy_reward": 0.1354166716337204, | |
| "rewards/format_reward": 0.37500001583248377, | |
| "step": 77, | |
| "w_high_ratio": 0.006981382612138987, | |
| "w_low_ratio": 0.02834776253439486, | |
| "w_max": 1.5653101801872253, | |
| "w_mean": 1.1680251359939575, | |
| "w_min": 5.385388925526598e-27, | |
| "w_std": 0.172462142072618 | |
| }, | |
| { | |
| "completion_length": 3528.5001220703125, | |
| "cov_mean": -2.2849541437608423e-05, | |
| "cov_std": 0.24372886680066586, | |
| "entropy": 0.41015625, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.13359344005584717, | |
| "kl": 0.0022611618041992188, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.037, | |
| "reward": 0.6562500186264515, | |
| "reward_std": 0.5271749570965767, | |
| "rewards/accuracy_reward": 0.2812500111758709, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 78, | |
| "w_high_ratio": 0.041247133165597916, | |
| "w_low_ratio": 0.027018944965675473, | |
| "w_max": 1.6266585290431976, | |
| "w_mean": 1.1576823890209198, | |
| "w_min": 1.890902738208876e-34, | |
| "w_std": 0.16164034884423018 | |
| }, | |
| { | |
| "completion_length": 2676.187545776367, | |
| "cov_mean": 1.6489982044731732e-05, | |
| "cov_std": 0.21785889007151127, | |
| "entropy": 0.34912109375, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.1319928616285324, | |
| "kl": 0.0022878646850585938, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.0289, | |
| "reward": 0.843750037252903, | |
| "reward_std": 0.4268086552619934, | |
| "rewards/accuracy_reward": 0.26041667349636555, | |
| "rewards/format_reward": 0.5833333544433117, | |
| "step": 79, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.03145516477525234, | |
| "w_max": 1.4992458820343018, | |
| "w_mean": 1.168209046125412, | |
| "w_min": 0.0, | |
| "w_std": 0.1453277636319399 | |
| }, | |
| { | |
| "completion_length": 3573.8021240234375, | |
| "cov_mean": -9.463059541303664e-06, | |
| "cov_std": 0.1826024018228054, | |
| "entropy": 0.48974609375, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.0996597409248352, | |
| "kl": 0.002468109130859375, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.0271, | |
| "reward": 0.520833358168602, | |
| "reward_std": 0.3810138627886772, | |
| "rewards/accuracy_reward": 0.1979166679084301, | |
| "rewards/format_reward": 0.3229166753590107, | |
| "step": 80, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.02435835381038487, | |
| "w_max": 1.368900626897812, | |
| "w_mean": 1.106232464313507, | |
| "w_min": 0.25, | |
| "w_std": 0.11319147422909737 | |
| }, | |
| { | |
| "completion_length": 3308.625, | |
| "cov_mean": -1.9006092770723626e-05, | |
| "cov_std": 0.1842699982225895, | |
| "entropy": 0.58740234375, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.15561415255069733, | |
| "kl": 0.005096435546875, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.0328, | |
| "reward": 0.4062500149011612, | |
| "reward_std": 0.28905032202601433, | |
| "rewards/accuracy_reward": 0.11458333395421505, | |
| "rewards/format_reward": 0.291666679084301, | |
| "step": 81, | |
| "w_high_ratio": 0.125, | |
| "w_low_ratio": 0.02731443475931883, | |
| "w_max": 1.497319370508194, | |
| "w_mean": 1.1954041719436646, | |
| "w_min": 0.25, | |
| "w_std": 0.12777045369148254 | |
| }, | |
| { | |
| "completion_length": 3054.5938110351562, | |
| "cov_mean": -1.190575176224229e-05, | |
| "cov_std": 0.13467486761510372, | |
| "entropy": 0.453125, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.06870616227388382, | |
| "kl": 0.0037078857421875, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0283, | |
| "reward": 0.6458333432674408, | |
| "reward_std": 0.2705298960208893, | |
| "rewards/accuracy_reward": 0.2187500074505806, | |
| "rewards/format_reward": 0.4270833432674408, | |
| "step": 82, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.016570631880313158, | |
| "w_max": 1.6605907380580902, | |
| "w_mean": 1.2332959175109863, | |
| "w_min": 0.25, | |
| "w_std": 0.10085548926144838 | |
| }, | |
| { | |
| "completion_length": 3103.7188110351562, | |
| "cov_mean": -1.1289954500171007e-05, | |
| "cov_std": 0.25786374136805534, | |
| "entropy": 0.4951171875, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.11641041934490204, | |
| "kl": 0.0033721923828125, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0561, | |
| "reward": 0.541666679084301, | |
| "reward_std": 0.5305610671639442, | |
| "rewards/accuracy_reward": 0.17708333488553762, | |
| "rewards/format_reward": 0.3645833507180214, | |
| "step": 83, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.03598734503611922, | |
| "w_max": 1.3528369665145874, | |
| "w_mean": 1.095271646976471, | |
| "w_min": 0.0, | |
| "w_std": 0.1604925710707903 | |
| }, | |
| { | |
| "completion_length": 3278.104248046875, | |
| "cov_mean": 3.2929374356172048e-06, | |
| "cov_std": 0.246146522462368, | |
| "entropy": 0.45849609375, | |
| "epoch": 0.096, | |
| "grad_norm": 0.15934637188911438, | |
| "kl": 0.0013265609741210938, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0172, | |
| "reward": 0.7187500298023224, | |
| "reward_std": 0.432245634496212, | |
| "rewards/accuracy_reward": 0.3125000149011612, | |
| "rewards/format_reward": 0.4062500149011612, | |
| "step": 84, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.02858129981905222, | |
| "w_max": 1.45015150308609, | |
| "w_mean": 1.134983777999878, | |
| "w_min": 0.25, | |
| "w_std": 0.15726573020219803 | |
| }, | |
| { | |
| "completion_length": 3484.729248046875, | |
| "cov_mean": -8.481749773636693e-06, | |
| "cov_std": 0.29308537393808365, | |
| "entropy": 0.37841796875, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.1379634290933609, | |
| "kl": 0.0017528533935546875, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.0623, | |
| "reward": 0.5104166716337204, | |
| "reward_std": 0.6195737272500992, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/format_reward": 0.3437500037252903, | |
| "step": 85, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.03877481259405613, | |
| "w_max": 1.4117690026760101, | |
| "w_mean": 1.119108110666275, | |
| "w_min": 0.0, | |
| "w_std": 0.18827635422348976 | |
| }, | |
| { | |
| "completion_length": 3158.8125, | |
| "cov_mean": 6.843166598713424e-06, | |
| "cov_std": 0.12266075890511274, | |
| "entropy": 0.47119140625, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.09657198935747147, | |
| "kl": 0.0029735565185546875, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.0174, | |
| "reward": 0.541666679084301, | |
| "reward_std": 0.2581377625465393, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 86, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.018843807047232985, | |
| "w_max": 1.3829069435596466, | |
| "w_mean": 1.131670981645584, | |
| "w_min": 0.25, | |
| "w_std": 0.11318285018205643 | |
| }, | |
| { | |
| "completion_length": 3175.2188720703125, | |
| "cov_mean": -5.853003926858946e-05, | |
| "cov_std": 0.30259813368320465, | |
| "entropy": 0.556640625, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.38111400604248047, | |
| "kl": 0.007335662841796875, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0778, | |
| "reward": 0.7187500149011612, | |
| "reward_std": 0.5092682540416718, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/format_reward": 0.4895833432674408, | |
| "step": 87, | |
| "w_high_ratio": 0.11183382570743561, | |
| "w_low_ratio": 0.03777051903307438, | |
| "w_max": 1.9268704950809479, | |
| "w_mean": 1.334629088640213, | |
| "w_min": 5.605193857299268e-45, | |
| "w_std": 0.27019689977169037 | |
| }, | |
| { | |
| "completion_length": 3190.9375610351562, | |
| "cov_mean": 7.67477886256529e-05, | |
| "cov_std": 0.46318161487579346, | |
| "entropy": 0.5048828125, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.2448384165763855, | |
| "kl": 0.020694732666015625, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.0906, | |
| "reward": 0.7604167014360428, | |
| "reward_std": 0.7656450867652893, | |
| "rewards/accuracy_reward": 0.291666679084301, | |
| "rewards/format_reward": 0.4687500149011612, | |
| "step": 88, | |
| "w_high_ratio": 0.08163053542375565, | |
| "w_low_ratio": 0.06242929771542549, | |
| "w_max": 1.8597923815250397, | |
| "w_mean": 1.2850928604602814, | |
| "w_min": 2.421886516239847e-28, | |
| "w_std": 0.3251073509454727 | |
| }, | |
| { | |
| "completion_length": 3597.2396850585938, | |
| "cov_mean": -4.437502229848178e-05, | |
| "cov_std": 0.29206302016973495, | |
| "entropy": 0.45703125, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.16698718070983887, | |
| "kl": 0.00389862060546875, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0647, | |
| "reward": 0.4687500149011612, | |
| "reward_std": 0.5414880514144897, | |
| "rewards/accuracy_reward": 0.1770833395421505, | |
| "rewards/format_reward": 0.291666679084301, | |
| "step": 89, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.04101241147145629, | |
| "w_max": 1.4729963839054108, | |
| "w_mean": 1.0995305478572845, | |
| "w_min": 0.0, | |
| "w_std": 0.19675205275416374 | |
| }, | |
| { | |
| "completion_length": 2862.125, | |
| "cov_mean": -5.355579560273327e-06, | |
| "cov_std": 0.0925431028008461, | |
| "entropy": 0.638671875, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.09346118569374084, | |
| "kl": 0.01725006103515625, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.0269, | |
| "reward": 0.4791666716337204, | |
| "reward_std": 0.19776283204555511, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 90, | |
| "w_high_ratio": 0.12039810419082642, | |
| "w_low_ratio": 0.014701983891427517, | |
| "w_max": 2.006953328847885, | |
| "w_mean": 1.3530822694301605, | |
| "w_min": 0.5, | |
| "w_std": 0.09125572815537453 | |
| }, | |
| { | |
| "completion_length": 3527.2813110351562, | |
| "cov_mean": -6.6539573708723765e-06, | |
| "cov_std": 0.19490721449255943, | |
| "entropy": 0.47802734375, | |
| "epoch": 0.104, | |
| "grad_norm": 0.16878993809223175, | |
| "kl": 0.0045948028564453125, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0023, | |
| "reward": 0.48958333395421505, | |
| "reward_std": 0.31570227444171906, | |
| "rewards/accuracy_reward": 0.17708333861082792, | |
| "rewards/format_reward": 0.3125000027939677, | |
| "step": 91, | |
| "w_high_ratio": 0.0432400144636631, | |
| "w_low_ratio": 0.025078749749809504, | |
| "w_max": 1.4252241849899292, | |
| "w_mean": 1.1511092782020569, | |
| "w_min": 0.25, | |
| "w_std": 0.14859570004045963 | |
| }, | |
| { | |
| "completion_length": 2996.8333740234375, | |
| "cov_mean": 5.000362762075383e-06, | |
| "cov_std": 0.274563018232584, | |
| "entropy": 0.44921875, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.2672693729400635, | |
| "kl": 0.011915206909179688, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.0151, | |
| "reward": 0.6875000223517418, | |
| "reward_std": 0.3997742757201195, | |
| "rewards/accuracy_reward": 0.1562500074505806, | |
| "rewards/format_reward": 0.5312500074505806, | |
| "step": 92, | |
| "w_high_ratio": 0.057363301515579224, | |
| "w_low_ratio": 0.03872442920692265, | |
| "w_max": 1.9733782410621643, | |
| "w_mean": 1.2820636332035065, | |
| "w_min": 1.1237891646640876e-26, | |
| "w_std": 0.21614115312695503 | |
| }, | |
| { | |
| "completion_length": 3868.3333740234375, | |
| "cov_mean": 2.6024475801023073e-05, | |
| "cov_std": 0.11580366268754005, | |
| "entropy": 0.59130859375, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.07178976386785507, | |
| "kl": 0.00457763671875, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.0217, | |
| "reward": 0.10416666883975267, | |
| "reward_std": 0.23858631029725075, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/format_reward": 0.08333333488553762, | |
| "step": 93, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.022031503496691585, | |
| "w_max": 1.2298710346221924, | |
| "w_mean": 1.0156493484973907, | |
| "w_min": 9.954022423630139e-25, | |
| "w_std": 0.08076347131282091 | |
| }, | |
| { | |
| "completion_length": 3368.0834350585938, | |
| "cov_mean": -9.980105346585333e-07, | |
| "cov_std": 0.14382942207157612, | |
| "entropy": 0.56640625, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.09354749321937561, | |
| "kl": 0.010030746459960938, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0167, | |
| "reward": 0.479166679084301, | |
| "reward_std": 0.26436545327305794, | |
| "rewards/accuracy_reward": 0.15625000651925802, | |
| "rewards/format_reward": 0.3229166679084301, | |
| "step": 94, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.020133810699917376, | |
| "w_max": 1.4812421798706055, | |
| "w_mean": 1.140279084444046, | |
| "w_min": 2.8643974210955766e-17, | |
| "w_std": 0.10274781100451946 | |
| }, | |
| { | |
| "completion_length": 3727.229248046875, | |
| "cov_mean": 2.8429121812223457e-07, | |
| "cov_std": 0.1946401260793209, | |
| "entropy": 0.4609375, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.09702739864587784, | |
| "kl": 0.0018458366394042969, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.039, | |
| "reward": 0.2916666707023978, | |
| "reward_std": 0.4342379942536354, | |
| "rewards/accuracy_reward": 0.07291666977107525, | |
| "rewards/format_reward": 0.21875000838190317, | |
| "step": 95, | |
| "w_high_ratio": 0.0, | |
| "w_low_ratio": 0.027697827550582588, | |
| "w_max": 1.2496315836906433, | |
| "w_mean": 1.0440161526203156, | |
| "w_min": 0.0, | |
| "w_std": 0.13094050344079733 | |
| }, | |
| { | |
| "completion_length": 3174.6876220703125, | |
| "cov_mean": 0.00011378643011994427, | |
| "cov_std": 0.20376956462860107, | |
| "entropy": 0.4482421875, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.13061358034610748, | |
| "kl": 0.005021095275878906, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.007, | |
| "reward": 0.6562500074505806, | |
| "reward_std": 0.3372773453593254, | |
| "rewards/accuracy_reward": 0.2812500037252903, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 96, | |
| "w_high_ratio": 0.08483665436506271, | |
| "w_low_ratio": 0.023257225286215544, | |
| "w_max": 1.6405883729457855, | |
| "w_mean": 1.170585960149765, | |
| "w_min": 0.25, | |
| "w_std": 0.1635773852467537 | |
| }, | |
| { | |
| "completion_length": 3578.6458740234375, | |
| "cov_mean": 5.654522146869567e-05, | |
| "cov_std": 0.2582091810181737, | |
| "entropy": 0.4716796875, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.23674072325229645, | |
| "kl": 0.0026683807373046875, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0282, | |
| "reward": 0.48958334419876337, | |
| "reward_std": 0.4029072895646095, | |
| "rewards/accuracy_reward": 0.21875000558793545, | |
| "rewards/format_reward": 0.27083334140479565, | |
| "step": 97, | |
| "w_high_ratio": 0.0427275113761425, | |
| "w_low_ratio": 0.02712295390665531, | |
| "w_max": 1.5456224977970123, | |
| "w_mean": 1.134882390499115, | |
| "w_min": 8.233329127140463e-42, | |
| "w_std": 0.17803996708244085 | |
| }, | |
| { | |
| "completion_length": 3275.70849609375, | |
| "cov_mean": -2.5506165911792777e-05, | |
| "cov_std": 0.22730276361107826, | |
| "entropy": 0.43359375, | |
| "epoch": 0.112, | |
| "grad_norm": 0.154100239276886, | |
| "kl": 0.0016241073608398438, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.0791, | |
| "reward": 0.5208333432674408, | |
| "reward_std": 0.45262154936790466, | |
| "rewards/accuracy_reward": 0.14583333674818277, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 98, | |
| "w_high_ratio": 0.031526632606983185, | |
| "w_low_ratio": 0.030860408674925566, | |
| "w_max": 1.7042989134788513, | |
| "w_mean": 1.219407707452774, | |
| "w_min": 1.0509738482436128e-44, | |
| "w_std": 0.1900232806801796 | |
| }, | |
| { | |
| "completion_length": 3084.593795776367, | |
| "cov_mean": -6.495133902717498e-06, | |
| "cov_std": 0.13577165454626083, | |
| "entropy": 0.386474609375, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.07463299483060837, | |
| "kl": 0.008008956909179688, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.0185, | |
| "reward": 0.5208333488553762, | |
| "reward_std": 0.2819661721587181, | |
| "rewards/accuracy_reward": 0.19791667815297842, | |
| "rewards/format_reward": 0.3229166669771075, | |
| "step": 99, | |
| "w_high_ratio": 0.04817802831530571, | |
| "w_low_ratio": 0.017340978607535362, | |
| "w_max": 1.389538049697876, | |
| "w_mean": 1.1735607981681824, | |
| "w_min": 0.25, | |
| "w_std": 0.1109000938013196 | |
| }, | |
| { | |
| "completion_length": 3224.5834350585938, | |
| "cov_mean": -4.5620060973305954e-05, | |
| "cov_std": 0.26372817903757095, | |
| "entropy": 0.42724609375, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.13250450789928436, | |
| "kl": 0.009317398071289062, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0352, | |
| "reward": 0.7083333656191826, | |
| "reward_std": 0.4755344055593014, | |
| "rewards/accuracy_reward": 0.260416679084301, | |
| "rewards/format_reward": 0.447916679084301, | |
| "step": 100, | |
| "w_high_ratio": 0.05065765231847763, | |
| "w_low_ratio": 0.03619965072721243, | |
| "w_max": 1.6908635199069977, | |
| "w_mean": 1.1874340772628784, | |
| "w_min": 1.2465886678904214e-38, | |
| "w_std": 0.2075340449810028 | |
| }, | |
| { | |
| "epoch": 0.11428571428571428, | |
| "step": 100, | |
| "total_flos": 0.0, | |
| "train_loss": 0.041623960277065636, | |
| "train_runtime": 8415.8875, | |
| "train_samples_per_second": 1.141, | |
| "train_steps_per_second": 0.012 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 100, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |