JOY0021 committed on Apr 26

Commit

1c6eae6

verified ·

1 Parent(s): 2ce2593

Upload folder using huggingface_hub

Browse files

Files changed (24) hide show

.gitattributes +2 -0
README.md +67 -0
checkpoint-25/chat_template.jinja +54 -0
checkpoint-25/config.json +57 -0
checkpoint-25/generation_config.json +13 -0
checkpoint-25/model.safetensors +3 -0
checkpoint-25/optimizer.pt +3 -0
checkpoint-25/rng_state.pth +3 -0
checkpoint-25/scheduler.pt +3 -0
checkpoint-25/tokenizer.json +3 -0
checkpoint-25/tokenizer_config.json +32 -0
checkpoint-25/trainer_state.json +475 -0
checkpoint-25/training_args.bin +3 -0
checkpoint-50/chat_template.jinja +54 -0
checkpoint-50/config.json +57 -0
checkpoint-50/generation_config.json +13 -0
checkpoint-50/model.safetensors +3 -0
checkpoint-50/optimizer.pt +3 -0
checkpoint-50/rng_state.pth +3 -0
checkpoint-50/scheduler.pt +3 -0
checkpoint-50/tokenizer.json +3 -0
checkpoint-50/tokenizer_config.json +32 -0
checkpoint-50/trainer_state.json +903 -0
checkpoint-50/training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-25/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-50/tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,67 @@

+---
+base_model: Qwen/Qwen2.5-0.5B-Instruct
+library_name: transformers
+model_name: autonomy-agent-v2
+tags:
+- generated_from_trainer
+- trl
+- grpo
+license: apache-2.0
+---
+# Model Card for autonomy-agent-v2
+This model is a fine-tuned version of [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="JOY0021/autonomy-agent-v2", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+### Framework versions
+- TRL: 1.2.0
+- Transformers: 5.6.2
+- Pytorch: 2.11.0
+- Datasets: 4.8.4
+- Tokenizers: 0.22.2
+## Citations
+Cite GRPO as:
+```bibtex
+@article{shao2024deepseekmath,
+    title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+    author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+    year         = 2024,
+    eprint       = {arXiv:2402.03300},
+}
+```
+Cite TRL as:
+```bibtex
+@software{vonwerra2020trl,
+  title   = {{TRL: Transformers Reinforcement Learning}},
+  author  = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+  license = {Apache-2.0},
+  url     = {https://github.com/huggingface/trl},
+  year    = {2020}
+}
+```

checkpoint-25/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoint-25/config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000.0,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.6.2",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

checkpoint-25/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "5.6.2"
+}

checkpoint-25/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fdf756fa7fcbe7404d5c60e26bff1a0c8b8aa1f72ced49e7dd0210fe288fb7fe
+size 988097824

checkpoint-25/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:49070c7706b4d474935cf3ea7b357b2c28081a56fdb74d22bf333b8bc8bd6977
+size 1976378699

checkpoint-25/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c26fb843dab8e5657e4e5578986260ff3eab826cf10a8afcb7340750033c35f7
+size 14645

checkpoint-25/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a5b3364c61da829cfa47305bd0597607812e658ee7f7017c30727fef04f7e9d
+size 1465

checkpoint-25/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
+size 11421892

checkpoint-25/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": false,
+  "local_files_only": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "truncation_side": "left",
+  "unk_token": null
+}

checkpoint-25/trainer_state.json ADDED Viewed

	@@ -0,0 +1,475 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.0625,
+  "eval_steps": 500,
+  "global_step": 25,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.975826621055603,
+      "epoch": 0.0025,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 1e-05,
+      "loss": 0.0,
+      "num_tokens": 1132.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 1,
+      "step_time": 5.940139313000145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.439243793487549,
+      "epoch": 0.005,
+      "grad_norm": 0.0,
+      "learning_rate": 9.800000000000001e-06,
+      "loss": 0.0,
+      "step": 2,
+      "step_time": 0.13269777900040935
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.167536497116089,
+      "epoch": 0.0075,
+      "grad_norm": 0.0,
+      "learning_rate": 9.600000000000001e-06,
+      "loss": 0.0,
+      "step": 3,
+      "step_time": 0.12384193100024277
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.270764112472534,
+      "epoch": 0.01,
+      "grad_norm": 0.0,
+      "learning_rate": 9.4e-06,
+      "loss": 0.0,
+      "step": 4,
+      "step_time": 0.12020948200006387
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.7375574111938477,
+      "epoch": 0.0125,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 9.200000000000002e-06,
+      "loss": 0.0,
+      "num_tokens": 2240.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 5,
+      "step_time": 5.4328740460005065
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.2307178974151611,
+      "epoch": 0.015,
+      "grad_norm": 0.0,
+      "learning_rate": 9e-06,
+      "loss": 0.0,
+      "step": 6,
+      "step_time": 0.12100409000049694
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.5965301990509033,
+      "epoch": 0.0175,
+      "grad_norm": 0.0,
+      "learning_rate": 8.8e-06,
+      "loss": 0.0,
+      "step": 7,
+      "step_time": 0.11992437099979725
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 3.000967502593994,
+      "epoch": 0.02,
+      "grad_norm": 0.0,
+      "learning_rate": 8.6e-06,
+      "loss": 0.0,
+      "step": 8,
+      "step_time": 0.12026366800000687
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.1747303009033203,
+      "epoch": 0.0225,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 8.400000000000001e-06,
+      "loss": 0.0,
+      "num_tokens": 3372.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 9,
+      "step_time": 5.488657728999897
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.2987818717956543,
+      "epoch": 0.025,
+      "grad_norm": 0.0,
+      "learning_rate": 8.2e-06,
+      "loss": 0.0,
+      "step": 10,
+      "step_time": 0.12148796400015272
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.549589157104492,
+      "epoch": 0.0275,
+      "grad_norm": 0.0,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 0.0,
+      "step": 11,
+      "step_time": 0.1202768709999873
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.3748786449432373,
+      "epoch": 0.03,
+      "grad_norm": 0.0,
+      "learning_rate": 7.800000000000002e-06,
+      "loss": 0.0,
+      "step": 12,
+      "step_time": 0.12146498200036149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.562595844268799,
+      "epoch": 0.0325,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 7.600000000000001e-06,
+      "loss": 0.0,
+      "num_tokens": 4504.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 13,
+      "step_time": 5.529226956000457
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.9105567932128906,
+      "epoch": 0.035,
+      "grad_norm": 0.0,
+      "learning_rate": 7.4e-06,
+      "loss": 0.0,
+      "step": 14,
+      "step_time": 0.12003547699987394
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.360198497772217,
+      "epoch": 0.0375,
+      "grad_norm": 0.0,
+      "learning_rate": 7.2000000000000005e-06,
+      "loss": 0.0,
+      "step": 15,
+      "step_time": 0.12056965900046634
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9906266927719116,
+      "epoch": 0.04,
+      "grad_norm": 0.0,
+      "learning_rate": 7e-06,
+      "loss": 0.0,
+      "step": 16,
+      "step_time": 0.11991975299952173
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.4847490787506104,
+      "epoch": 0.0425,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 6.800000000000001e-06,
+      "loss": 0.0,
+      "num_tokens": 5636.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 17,
+      "step_time": 5.571549678000338
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.0457265377044678,
+      "epoch": 0.045,
+      "grad_norm": 0.0,
+      "learning_rate": 6.600000000000001e-06,
+      "loss": 0.0,
+      "step": 18,
+      "step_time": 0.12112131499998213
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.4426045417785645,
+      "epoch": 0.0475,
+      "grad_norm": 0.0,
+      "learning_rate": 6.4000000000000006e-06,
+      "loss": 0.0,
+      "step": 19,
+      "step_time": 0.12062360999971133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.8799967765808105,
+      "epoch": 0.05,
+      "grad_norm": 0.0,
+      "learning_rate": 6.200000000000001e-06,
+      "loss": 0.0,
+      "step": 20,
+      "step_time": 0.1196468329999334
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.4007647037506104,
+      "epoch": 0.0525,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 6e-06,
+      "loss": 0.0,
+      "num_tokens": 6756.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 21,
+      "step_time": 5.46478837199993
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.0130887031555176,
+      "epoch": 0.055,
+      "grad_norm": 0.0,
+      "learning_rate": 5.8e-06,
+      "loss": 0.0,
+      "step": 22,
+      "step_time": 0.12104618500052311
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 3.6849136352539062,
+      "epoch": 0.0575,
+      "grad_norm": 0.0,
+      "learning_rate": 5.600000000000001e-06,
+      "loss": 0.0,
+      "step": 23,
+      "step_time": 0.12017933200058906
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.6699423789978027,
+      "epoch": 0.06,
+      "grad_norm": 0.0,
+      "learning_rate": 5.400000000000001e-06,
+      "loss": 0.0,
+      "step": 24,
+      "step_time": 0.11987412399957975
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.353423595428467,
+      "epoch": 0.0625,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 5.2e-06,
+      "loss": 0.0,
+      "num_tokens": 7864.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 25,
+      "step_time": 5.475499750999916
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 50,
+  "num_input_tokens_seen": 7864,
+  "num_train_epochs": 1,
+  "save_steps": 25,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-25/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2d256cc6975c14fe69a6b3227efd5b2a62eb8616002cfbc058782540abc422f7
+size 7185

checkpoint-50/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoint-50/config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000.0,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.6.2",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

checkpoint-50/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "5.6.2"
+}

checkpoint-50/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fdf756fa7fcbe7404d5c60e26bff1a0c8b8aa1f72ced49e7dd0210fe288fb7fe
+size 988097824

checkpoint-50/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:055b61eef78c73abcca4dbfdccb61a4163db9c61516f25f8ad7f58e671e85828
+size 1976378699

checkpoint-50/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c54cfd95c6cc3e287b93c5ac9469d6ee4290acf8ad1d1ae9c3bc8de0a3082b4a
+size 14645

checkpoint-50/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:976f2bafa7669d9a7187fa276b6a26ad9abd7bff1427e837484e3c6b2ab4eff4
+size 1465

checkpoint-50/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
+size 11421892

checkpoint-50/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": false,
+  "local_files_only": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "truncation_side": "left",
+  "unk_token": null
+}

checkpoint-50/trainer_state.json ADDED Viewed

	@@ -0,0 +1,903 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.125,
+  "eval_steps": 500,
+  "global_step": 50,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.975826621055603,
+      "epoch": 0.0025,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 1e-05,
+      "loss": 0.0,
+      "num_tokens": 1132.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 1,
+      "step_time": 5.940139313000145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.439243793487549,
+      "epoch": 0.005,
+      "grad_norm": 0.0,
+      "learning_rate": 9.800000000000001e-06,
+      "loss": 0.0,
+      "step": 2,
+      "step_time": 0.13269777900040935
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.167536497116089,
+      "epoch": 0.0075,
+      "grad_norm": 0.0,
+      "learning_rate": 9.600000000000001e-06,
+      "loss": 0.0,
+      "step": 3,
+      "step_time": 0.12384193100024277
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.270764112472534,
+      "epoch": 0.01,
+      "grad_norm": 0.0,
+      "learning_rate": 9.4e-06,
+      "loss": 0.0,
+      "step": 4,
+      "step_time": 0.12020948200006387
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 1.7375574111938477,
+      "epoch": 0.0125,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 9.200000000000002e-06,
+      "loss": 0.0,
+      "num_tokens": 2240.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 5,
+      "step_time": 5.4328740460005065
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.2307178974151611,
+      "epoch": 0.015,
+      "grad_norm": 0.0,
+      "learning_rate": 9e-06,
+      "loss": 0.0,
+      "step": 6,
+      "step_time": 0.12100409000049694
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.5965301990509033,
+      "epoch": 0.0175,
+      "grad_norm": 0.0,
+      "learning_rate": 8.8e-06,
+      "loss": 0.0,
+      "step": 7,
+      "step_time": 0.11992437099979725
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 3.000967502593994,
+      "epoch": 0.02,
+      "grad_norm": 0.0,
+      "learning_rate": 8.6e-06,
+      "loss": 0.0,
+      "step": 8,
+      "step_time": 0.12026366800000687
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.1747303009033203,
+      "epoch": 0.0225,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 8.400000000000001e-06,
+      "loss": 0.0,
+      "num_tokens": 3372.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 9,
+      "step_time": 5.488657728999897
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.2987818717956543,
+      "epoch": 0.025,
+      "grad_norm": 0.0,
+      "learning_rate": 8.2e-06,
+      "loss": 0.0,
+      "step": 10,
+      "step_time": 0.12148796400015272
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.549589157104492,
+      "epoch": 0.0275,
+      "grad_norm": 0.0,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 0.0,
+      "step": 11,
+      "step_time": 0.1202768709999873
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.3748786449432373,
+      "epoch": 0.03,
+      "grad_norm": 0.0,
+      "learning_rate": 7.800000000000002e-06,
+      "loss": 0.0,
+      "step": 12,
+      "step_time": 0.12146498200036149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.562595844268799,
+      "epoch": 0.0325,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 7.600000000000001e-06,
+      "loss": 0.0,
+      "num_tokens": 4504.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 13,
+      "step_time": 5.529226956000457
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.9105567932128906,
+      "epoch": 0.035,
+      "grad_norm": 0.0,
+      "learning_rate": 7.4e-06,
+      "loss": 0.0,
+      "step": 14,
+      "step_time": 0.12003547699987394
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.360198497772217,
+      "epoch": 0.0375,
+      "grad_norm": 0.0,
+      "learning_rate": 7.2000000000000005e-06,
+      "loss": 0.0,
+      "step": 15,
+      "step_time": 0.12056965900046634
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9906266927719116,
+      "epoch": 0.04,
+      "grad_norm": 0.0,
+      "learning_rate": 7e-06,
+      "loss": 0.0,
+      "step": 16,
+      "step_time": 0.11991975299952173
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.4847490787506104,
+      "epoch": 0.0425,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 6.800000000000001e-06,
+      "loss": 0.0,
+      "num_tokens": 5636.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 17,
+      "step_time": 5.571549678000338
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.0457265377044678,
+      "epoch": 0.045,
+      "grad_norm": 0.0,
+      "learning_rate": 6.600000000000001e-06,
+      "loss": 0.0,
+      "step": 18,
+      "step_time": 0.12112131499998213
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.4426045417785645,
+      "epoch": 0.0475,
+      "grad_norm": 0.0,
+      "learning_rate": 6.4000000000000006e-06,
+      "loss": 0.0,
+      "step": 19,
+      "step_time": 0.12062360999971133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.8799967765808105,
+      "epoch": 0.05,
+      "grad_norm": 0.0,
+      "learning_rate": 6.200000000000001e-06,
+      "loss": 0.0,
+      "step": 20,
+      "step_time": 0.1196468329999334
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.4007647037506104,
+      "epoch": 0.0525,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 6e-06,
+      "loss": 0.0,
+      "num_tokens": 6756.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 21,
+      "step_time": 5.46478837199993
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.0130887031555176,
+      "epoch": 0.055,
+      "grad_norm": 0.0,
+      "learning_rate": 5.8e-06,
+      "loss": 0.0,
+      "step": 22,
+      "step_time": 0.12104618500052311
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 3.6849136352539062,
+      "epoch": 0.0575,
+      "grad_norm": 0.0,
+      "learning_rate": 5.600000000000001e-06,
+      "loss": 0.0,
+      "step": 23,
+      "step_time": 0.12017933200058906
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.6699423789978027,
+      "epoch": 0.06,
+      "grad_norm": 0.0,
+      "learning_rate": 5.400000000000001e-06,
+      "loss": 0.0,
+      "step": 24,
+      "step_time": 0.11987412399957975
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.353423595428467,
+      "epoch": 0.0625,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 5.2e-06,
+      "loss": 0.0,
+      "num_tokens": 7864.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 25,
+      "step_time": 5.475499750999916
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.6273622512817383,
+      "epoch": 0.065,
+      "grad_norm": 0.0,
+      "learning_rate": 5e-06,
+      "loss": 0.0,
+      "step": 26,
+      "step_time": 0.12144228599936469
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.7559497356414795,
+      "epoch": 0.0675,
+      "grad_norm": 0.0,
+      "learning_rate": 4.800000000000001e-06,
+      "loss": 0.0,
+      "step": 27,
+      "step_time": 0.12019816300016828
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 3.4082813262939453,
+      "epoch": 0.07,
+      "grad_norm": 0.0,
+      "learning_rate": 4.600000000000001e-06,
+      "loss": 0.0,
+      "step": 28,
+      "step_time": 0.12150602700057789
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.0362539291381836,
+      "epoch": 0.0725,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 4.4e-06,
+      "loss": 0.0,
+      "num_tokens": 8972.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 29,
+      "step_time": 5.578138597999896
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9473274946212769,
+      "epoch": 0.075,
+      "grad_norm": 0.0,
+      "learning_rate": 4.2000000000000004e-06,
+      "loss": 0.0,
+      "step": 30,
+      "step_time": 0.11901681000017561
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 3.764465570449829,
+      "epoch": 0.0775,
+      "grad_norm": 0.0,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 0.0,
+      "step": 31,
+      "step_time": 0.11898399900019285
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9196826219558716,
+      "epoch": 0.08,
+      "grad_norm": 0.0,
+      "learning_rate": 3.8000000000000005e-06,
+      "loss": 0.0,
+      "step": 32,
+      "step_time": 0.12208616299994901
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.1861605644226074,
+      "epoch": 0.0825,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 3.6000000000000003e-06,
+      "loss": 0.0,
+      "num_tokens": 10104.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 33,
+      "step_time": 5.526427269999658
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9254075288772583,
+      "epoch": 0.085,
+      "grad_norm": 0.0,
+      "learning_rate": 3.4000000000000005e-06,
+      "loss": 0.0,
+      "step": 34,
+      "step_time": 0.11869688299975678
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.2586331367492676,
+      "epoch": 0.0875,
+      "grad_norm": 0.0,
+      "learning_rate": 3.2000000000000003e-06,
+      "loss": 0.0,
+      "step": 35,
+      "step_time": 0.11914809700010665
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.4637088775634766,
+      "epoch": 0.09,
+      "grad_norm": 0.0,
+      "learning_rate": 3e-06,
+      "loss": 0.0,
+      "step": 36,
+      "step_time": 0.1263795000004393
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 3.3892905712127686,
+      "epoch": 0.0925,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 2.8000000000000003e-06,
+      "loss": 0.0,
+      "num_tokens": 11212.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 37,
+      "step_time": 5.478073480000603
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 3.1603095531463623,
+      "epoch": 0.095,
+      "grad_norm": 0.0,
+      "learning_rate": 2.6e-06,
+      "loss": 0.0,
+      "step": 38,
+      "step_time": 0.12008928099930927
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.8118975162506104,
+      "epoch": 0.0975,
+      "grad_norm": 0.0,
+      "learning_rate": 2.4000000000000003e-06,
+      "loss": 0.0,
+      "step": 39,
+      "step_time": 0.11953222700049082
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.9064228534698486,
+      "epoch": 0.1,
+      "grad_norm": 0.0,
+      "learning_rate": 2.2e-06,
+      "loss": 0.0,
+      "step": 40,
+      "step_time": 0.11825968199991621
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.5093579292297363,
+      "epoch": 0.1025,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 0.0,
+      "num_tokens": 12320.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 41,
+      "step_time": 5.509690339999906
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.3179931640625,
+      "epoch": 0.105,
+      "grad_norm": 0.0,
+      "learning_rate": 1.8000000000000001e-06,
+      "loss": 0.0,
+      "step": 42,
+      "step_time": 0.12237605300015275
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.4812008142471313,
+      "epoch": 0.1075,
+      "grad_norm": 0.0,
+      "learning_rate": 1.6000000000000001e-06,
+      "loss": 0.0,
+      "step": 43,
+      "step_time": 0.12001185999997688
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 4.90458869934082,
+      "epoch": 0.11,
+      "grad_norm": 0.0,
+      "learning_rate": 1.4000000000000001e-06,
+      "loss": 0.0,
+      "step": 44,
+      "step_time": 0.11975958299990452
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.2333452701568604,
+      "epoch": 0.1125,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 1.2000000000000002e-06,
+      "loss": 0.0,
+      "num_tokens": 13452.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 45,
+      "step_time": 5.512979769000594
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 1.8680591583251953,
+      "epoch": 0.115,
+      "grad_norm": 0.0,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 0.0,
+      "step": 46,
+      "step_time": 0.12362773699987883
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 3.232543706893921,
+      "epoch": 0.1175,
+      "grad_norm": 0.0,
+      "learning_rate": 8.000000000000001e-07,
+      "loss": 0.0,
+      "step": 47,
+      "step_time": 0.12175974799993128
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.694847583770752,
+      "epoch": 0.12,
+      "grad_norm": 0.0,
+      "learning_rate": 6.000000000000001e-07,
+      "loss": 0.0,
+      "step": 48,
+      "step_time": 0.11931696000010561
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 256.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 2.182255268096924,
+      "epoch": 0.1225,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 4.0000000000000003e-07,
+      "loss": 0.0,
+      "num_tokens": 14572.0,
+      "reward": 0.009999999776482582,
+      "reward_std": 0.0,
+      "rewards/autonomy_reward_fn/mean": 0.009999999776482582,
+      "rewards/autonomy_reward_fn/std": 0.0,
+      "step": 49,
+      "step_time": 5.534827849000067
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "entropy": 2.535165309906006,
+      "epoch": 0.125,
+      "grad_norm": 0.0,
+      "learning_rate": 2.0000000000000002e-07,
+      "loss": 0.0,
+      "step": 50,
+      "step_time": 0.12092450500040286
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 50,
+  "num_input_tokens_seen": 14572,
+  "num_train_epochs": 1,
+  "save_steps": 25,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-50/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2d256cc6975c14fe69a6b3227efd5b2a62eb8616002cfbc058782540abc422f7
+size 7185