Jatin997 committed on 28 days ago

Commit

4696fbc

verified ·

1 Parent(s): 800e929

Upload folder using huggingface_hub

Browse files

Files changed (26) hide show

.gitattributes +1 -0
README.md +2 -2
checkpoint-64/chat_template.jinja +89 -0
checkpoint-64/config.json +63 -0
checkpoint-64/generation_config.json +12 -0
checkpoint-64/model.safetensors +3 -0
checkpoint-64/optimizer.pt +3 -0
checkpoint-64/rng_state.pth +3 -0
checkpoint-64/scheduler.pt +3 -0
checkpoint-64/tokenizer.json +3 -0
checkpoint-64/tokenizer_config.json +75 -0
checkpoint-64/trainer_state.json +208 -0
checkpoint-64/training_args.bin +3 -0
completions/completions_00010.parquet +2 -2
completions/completions_00020.parquet +3 -0
completions/completions_00030.parquet +3 -0
completions/completions_00040.parquet +3 -0
completions/completions_00050.parquet +3 -0
completions/completions_00060.parquet +3 -0
completions/completions_00064.parquet +3 -0
metrics/loss_curve.svg +4 -4
metrics/loss_history.json +21 -1
metrics/reward_curve.svg +4 -4
metrics/reward_history.json +23 -3
run_manifest.json +4 -4
trainer_state.json +192 -47

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 checkpoint-12/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 checkpoint-12/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-64/tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 base_model: Qwen/Qwen3-0.6B
 library_name: transformers
-model_name: pulse_er_grpo_final
 tags:
 - generated_from_trainer
 - hf_jobs
@@ -10,7 +10,7 @@ tags:
 licence: license
 ---
-# Model Card for pulse_er_grpo_final
 This model is a fine-tuned version of [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B).
 It has been trained using [TRL](https://github.com/huggingface/trl).

 ---
 base_model: Qwen/Qwen3-0.6B
 library_name: transformers
+model_name: pulse_er_grpo_curve_v2
 tags:
 - generated_from_trainer
 - hf_jobs
 licence: license
 ---
+# Model Card for pulse_er_grpo_curve_v2
 This model is a fine-tuned version of [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B).
 It has been trained using [TRL](https://github.com/huggingface/trl).

checkpoint-64/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,89 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

checkpoint-64/config.json ADDED Viewed

	@@ -0,0 +1,63 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.6.2",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

checkpoint-64/generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "5.6.2"
+}

checkpoint-64/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9daeb3bbd2ebb9b4a9910781ba136204aaeef6c683148c50637f29a347a9b51e
+size 2384234968

checkpoint-64/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ae22a58b918c4e55813693f56a60d74d3036e2924f45a5f20888c6e0c8074a2f
+size 4768664614

checkpoint-64/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b95731ce312b6d3f623d553049d545337c5d22f01090116e149fd3cf089643e
+size 14244

checkpoint-64/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:062d9669d1ff2893787af221dc1a10f378ff916c64725933b3d39e4de4dd1029
+size 1064

checkpoint-64/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650

checkpoint-64/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,75 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": false,
+  "local_files_only": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "reasoning_content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object",
+          "x-parser": "json",
+          "x-parser-args": {
+            "transform": "{type: 'function', function: @}"
+          }
+        },
+        "type": "array",
+        "x-regex-iterator": "<tool_call>\\s*(.+?)\\s*</tool_call>"
+      }
+    },
+    "type": "object",
+    "x-regex": "^(?:<think>\\n?(?:(?P<reasoning_content>.*?\\S.*?)\\n?|[\\s]*)</think>\\s*)?(?P<content>.*?)(?:\\n(?=<tool_call>))?(?=(?:<tool_call>|<\\|im_end\\|>|$))(?P<tool_calls>(?:<tool_call>.+?</tool_call>\\s*)+)?\\s*(?:<\\|im_end\\|>|$)"
+  },
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "truncation_side": "left",
+  "unk_token": null
+}

checkpoint-64/trainer_state.json ADDED Viewed

	@@ -0,0 +1,208 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 64,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 281.7,
+      "completions/max_terminated_length": 281.7,
+      "completions/mean_length": 173.90625,
+      "completions/mean_terminated_length": 173.90625,
+      "completions/min_length": 60.1,
+      "completions/min_terminated_length": 60.1,
+      "entropy": 0.34981602281332014,
+      "epoch": 0.3125,
+      "frac_reward_zero_std": 0.2,
+      "grad_norm": 3.5286595821380615,
+      "learning_rate": 8.593749999999999e-07,
+      "loss": -0.03454443216323853,
+      "num_tokens": 363490.0,
+      "reward": -1.9279687214642762,
+      "reward_std": 6.467041900753975,
+      "rewards/pulse_reward/mean": -1.9279687214642762,
+      "rewards/pulse_reward/std": 6.467041908204555,
+      "step": 10,
+      "step_time": 35.20241980789724,
+      "tools/call_frequency": 5.49375,
+      "tools/failure_frequency": 0.019680690905079245
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 294.3,
+      "completions/max_terminated_length": 294.3,
+      "completions/mean_length": 172.56875,
+      "completions/mean_terminated_length": 172.56875,
+      "completions/min_length": 50.3,
+      "completions/min_terminated_length": 50.3,
+      "entropy": 0.23517516404390335,
+      "epoch": 0.625,
+      "frac_reward_zero_std": 0.2625,
+      "grad_norm": 3.8192903995513916,
+      "learning_rate": 7.031249999999999e-07,
+      "loss": 0.01356593519449234,
+      "num_tokens": 726552.0,
+      "reward": 0.09853125289082527,
+      "reward_std": 0.583341383934021,
+      "rewards/pulse_reward/mean": 0.09853125289082527,
+      "rewards/pulse_reward/std": 0.583341383934021,
+      "step": 20,
+      "step_time": 39.57589099079487,
+      "tools/call_frequency": 6.35,
+      "tools/failure_frequency": 0.005996426660567522
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 273.0,
+      "completions/max_terminated_length": 273.0,
+      "completions/mean_length": 149.115625,
+      "completions/mean_terminated_length": 149.115625,
+      "completions/min_length": 50.9,
+      "completions/min_terminated_length": 50.9,
+      "entropy": 0.1797773003578186,
+      "epoch": 0.9375,
+      "frac_reward_zero_std": 0.0875,
+      "grad_norm": 5.692322254180908,
+      "learning_rate": 5.46875e-07,
+      "loss": 0.06958286762237549,
+      "num_tokens": 1082109.0,
+      "reward": 0.3879156395792961,
+      "reward_std": 0.6661841243505477,
+      "rewards/pulse_reward/mean": 0.3879156395792961,
+      "rewards/pulse_reward/std": 0.6661841526627541,
+      "step": 30,
+      "step_time": 30.443527194001945,
+      "tools/call_frequency": 5.58125,
+      "tools/failure_frequency": 0.005842319130897522
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 115.3,
+      "completions/max_terminated_length": 115.3,
+      "completions/mean_length": 70.528125,
+      "completions/mean_terminated_length": 70.528125,
+      "completions/min_length": 38.1,
+      "completions/min_terminated_length": 38.1,
+      "entropy": 0.1547975329682231,
+      "epoch": 1.25,
+      "frac_reward_zero_std": 0.1625,
+      "grad_norm": 3.677471399307251,
+      "learning_rate": 3.9062499999999997e-07,
+      "loss": 0.02682075500488281,
+      "num_tokens": 1412518.0,
+      "reward": 1.0730250239372254,
+      "reward_std": 0.7086882412433624,
+      "rewards/pulse_reward/mean": 1.0730250239372254,
+      "rewards/pulse_reward/std": 0.7086882352828979,
+      "step": 40,
+      "step_time": 11.361506971600466,
+      "tools/call_frequency": 2.215625,
+      "tools/failure_frequency": 0.0015384615398943424
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 83.1,
+      "completions/max_terminated_length": 83.1,
+      "completions/mean_length": 65.421875,
+      "completions/mean_terminated_length": 65.421875,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 0.12452723067253828,
+      "epoch": 1.5625,
+      "frac_reward_zero_std": 0.6625,
+      "grad_norm": 3.609057664871216,
+      "learning_rate": 2.3437499999999998e-07,
+      "loss": -3.346521407365799e-05,
+      "num_tokens": 1741293.0,
+      "reward": 1.4852312803268433,
+      "reward_std": 0.3715872406959534,
+      "rewards/pulse_reward/mean": 1.4852312803268433,
+      "rewards/pulse_reward/std": 0.3715872406959534,
+      "step": 50,
+      "step_time": 8.49522676919878,
+      "tools/call_frequency": 2.0,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 86.6,
+      "completions/max_terminated_length": 86.6,
+      "completions/mean_length": 65.471875,
+      "completions/mean_terminated_length": 65.471875,
+      "completions/min_length": 53.6,
+      "completions/min_terminated_length": 53.6,
+      "entropy": 0.11676975060254335,
+      "epoch": 1.875,
+      "frac_reward_zero_std": 0.8125,
+      "grad_norm": 2.142204761505127,
+      "learning_rate": 7.812499999999999e-08,
+      "loss": -0.00026753861457109453,
+      "num_tokens": 2070084.0,
+      "reward": 1.538143789768219,
+      "reward_std": 0.2553505107760429,
+      "rewards/pulse_reward/mean": 1.538143789768219,
+      "rewards/pulse_reward/std": 0.25535051375627515,
+      "step": 60,
+      "step_time": 8.590878555196104,
+      "tools/call_frequency": 1.996875,
+      "tools/failure_frequency": 0.0
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 64,
+  "num_input_tokens_seen": 2201581,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-64/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:beef4bb9265600265ee78a20bc82f4606af28d145e10bb1b99f8c1c25b2dfef3
+size 6776

completions/completions_00010.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:10c0189e54b95efcd547bc7428c397934ae26e1dcfff7b8224d763f5d309bf56
-size 27095

 version https://git-lfs.github.com/spec/v1
+oid sha256:f3ed1e88b120607f7b8cbc21e6d009da8f473f089b81b63a4cc6d2e30f33eda3
+size 31937

completions/completions_00020.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:67b0276af96688b86a5d64aa8e3321028609a4197484f003ed3731c25c153538
+size 33667

completions/completions_00030.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1499c8f1a195fe8930fd7338a09d90d9f5551f604fcfdab147f034f0ecc71827
+size 28958

completions/completions_00040.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:646c066b43013c281723f0d4c62facd9a369b7025636ba5658a350bab1a5180e
+size 23850

completions/completions_00050.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d48aad7bd937639532ebe9dba92c318f0ad86db035761861204516149084a29a
+size 23846

completions/completions_00060.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be67ebb6ca105cca7c9f08e11e8259c50ea4bed9b1a3a00a70f000e14ed38c6d
+size 23589

completions/completions_00064.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ad2757babd27c9411393c4809b69ca6ee606012f352645a97006503671aa6bcb
+size 23756

metrics/loss_curve.svg CHANGED Viewed

metrics/loss_history.json CHANGED Viewed

@@ -1,6 +1,26 @@
 [
   {
     "step": 10.0,
-    "value": -0.015362872183322907
   }
 ]

 [
   {
     "step": 10.0,
+    "value": -0.03454443216323853
+  },
+  {
+    "step": 20.0,
+    "value": 0.01356593519449234
+  },
+  {
+    "step": 30.0,
+    "value": 0.06958286762237549
+  },
+  {
+    "step": 40.0,
+    "value": 0.02682075500488281
+  },
+  {
+    "step": 50.0,
+    "value": -3.346521407365799e-05
+  },
+  {
+    "step": 60.0,
+    "value": -0.00026753861457109453
   }
 ]

metrics/reward_curve.svg CHANGED Viewed

metrics/reward_history.json CHANGED Viewed

@@ -1,10 +1,30 @@
 [
   {
     "step": 10.0,
-    "value": -1.56496252566576
   },
   {
-    "step": 12.0,
-    "value": -1.5552499741315842
   }
 ]

 [
   {
     "step": 10.0,
+    "value": -1.9279687214642762
   },
   {
+    "step": 20.0,
+    "value": 0.09853125289082527
+  },
+  {
+    "step": 30.0,
+    "value": 0.3879156395792961
+  },
+  {
+    "step": 40.0,
+    "value": 1.0730250239372254
+  },
+  {
+    "step": 50.0,
+    "value": 1.4852312803268433
+  },
+  {
+    "step": 60.0,
+    "value": 1.538143789768219
+  },
+  {
+    "step": 64.0,
+    "value": 1.57924222946167
   }
 ]

run_manifest.json CHANGED Viewed

@@ -4,14 +4,14 @@
   "env_url": "http://127.0.0.1:8000",
   "fp16": false,
   "git_commit": "3b3d65241147f5ac61616a235e860943f956b9af",
-  "gradient_accumulation_steps": 2,
   "learning_rate": 1e-06,
   "max_steps": 1024,
   "model": "Qwen/Qwen3-0.6B",
   "num_generations": 4,
-  "num_samples": 24,
-  "num_train_epochs": 1.0,
-  "per_device_train_batch_size": 4,
   "scenario_id": "respiratory_distress",
   "seed": 42,
   "use_cpu": false

   "env_url": "http://127.0.0.1:8000",
   "fp16": false,
   "git_commit": "3b3d65241147f5ac61616a235e860943f956b9af",
+  "gradient_accumulation_steps": 4,
   "learning_rate": 1e-06,
   "max_steps": 1024,
   "model": "Qwen/Qwen3-0.6B",
   "num_generations": 4,
+  "num_samples": 256,
+  "num_train_epochs": 2.0,
+  "per_device_train_batch_size": 8,
   "scenario_id": "respiratory_distress",
   "seed": 42,
   "use_cpu": false

trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.0,
   "eval_steps": 500,
-  "global_step": 12,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -16,27 +16,27 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 233.8,
-      "completions/max_terminated_length": 233.8,
-      "completions/mean_length": 170.275,
-      "completions/mean_terminated_length": 170.275,
-      "completions/min_length": 88.3,
-      "completions/min_terminated_length": 88.3,
-      "entropy": 0.40836348086595536,
-      "epoch": 0.8333333333333334,
-      "frac_reward_zero_std": 0.15,
-      "grad_norm": 7.111741542816162,
-      "learning_rate": 2.5e-07,
-      "loss": -0.015362872183322907,
-      "num_tokens": 90582.0,
-      "reward": -1.56496252566576,
-      "reward_std": 4.461238273605704,
-      "rewards/pulse_reward/mean": -1.56496252566576,
-      "rewards/pulse_reward/std": 4.461238479241729,
       "step": 10,
-      "step_time": 18.493348840194813,
-      "tools/call_frequency": 4.75,
-      "tools/failure_frequency": 0.006923890113830567
     },
     {
       "clip_ratio/high_max": 0.0,
@@ -45,35 +45,180 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 260.0,
-      "completions/max_terminated_length": 260.0,
-      "completions/mean_length": 166.625,
-      "completions/mean_terminated_length": 166.625,
-      "completions/min_length": 87.0,
-      "completions/min_terminated_length": 87.0,
-      "entropy": 0.47208209335803986,
-      "epoch": 1.0,
-      "frac_reward_zero_std": 0.0,
-      "num_tokens": 108640.0,
-      "reward": -1.5552499741315842,
-      "reward_std": 5.09348089993,
-      "rewards/pulse_reward/mean": -1.5552499741315842,
-      "rewards/pulse_reward/std": 5.093480907380581,
-      "step": 12,
-      "step_time": 25.044593099497433,
-      "tools/call_frequency": 4.0,
       "tools/failure_frequency": 0.0,
       "total_flos": 0.0,
-      "train_loss": -0.013816780100266138,
-      "train_runtime": 244.674,
-      "train_samples_per_second": 0.098,
-      "train_steps_per_second": 0.049
     }
   ],
   "logging_steps": 10,
-  "max_steps": 12,
-  "num_input_tokens_seen": 108640,
-  "num_train_epochs": 1,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -88,7 +233,7 @@
     }
   },
   "total_flos": 0.0,
-  "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null
 }

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 2.0,
   "eval_steps": 500,
+  "global_step": 64,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 281.7,
+      "completions/max_terminated_length": 281.7,
+      "completions/mean_length": 173.90625,
+      "completions/mean_terminated_length": 173.90625,
+      "completions/min_length": 60.1,
+      "completions/min_terminated_length": 60.1,
+      "entropy": 0.34981602281332014,
+      "epoch": 0.3125,
+      "frac_reward_zero_std": 0.2,
+      "grad_norm": 3.5286595821380615,
+      "learning_rate": 8.593749999999999e-07,
+      "loss": -0.03454443216323853,
+      "num_tokens": 363490.0,
+      "reward": -1.9279687214642762,
+      "reward_std": 6.467041900753975,
+      "rewards/pulse_reward/mean": -1.9279687214642762,
+      "rewards/pulse_reward/std": 6.467041908204555,
       "step": 10,
+      "step_time": 35.20241980789724,
+      "tools/call_frequency": 5.49375,
+      "tools/failure_frequency": 0.019680690905079245
     },
     {
       "clip_ratio/high_max": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 294.3,
+      "completions/max_terminated_length": 294.3,
+      "completions/mean_length": 172.56875,
+      "completions/mean_terminated_length": 172.56875,
+      "completions/min_length": 50.3,
+      "completions/min_terminated_length": 50.3,
+      "entropy": 0.23517516404390335,
+      "epoch": 0.625,
+      "frac_reward_zero_std": 0.2625,
+      "grad_norm": 3.8192903995513916,
+      "learning_rate": 7.031249999999999e-07,
+      "loss": 0.01356593519449234,
+      "num_tokens": 726552.0,
+      "reward": 0.09853125289082527,
+      "reward_std": 0.583341383934021,
+      "rewards/pulse_reward/mean": 0.09853125289082527,
+      "rewards/pulse_reward/std": 0.583341383934021,
+      "step": 20,
+      "step_time": 39.57589099079487,
+      "tools/call_frequency": 6.35,
+      "tools/failure_frequency": 0.005996426660567522
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 273.0,
+      "completions/max_terminated_length": 273.0,
+      "completions/mean_length": 149.115625,
+      "completions/mean_terminated_length": 149.115625,
+      "completions/min_length": 50.9,
+      "completions/min_terminated_length": 50.9,
+      "entropy": 0.1797773003578186,
+      "epoch": 0.9375,
+      "frac_reward_zero_std": 0.0875,
+      "grad_norm": 5.692322254180908,
+      "learning_rate": 5.46875e-07,
+      "loss": 0.06958286762237549,
+      "num_tokens": 1082109.0,
+      "reward": 0.3879156395792961,
+      "reward_std": 0.6661841243505477,
+      "rewards/pulse_reward/mean": 0.3879156395792961,
+      "rewards/pulse_reward/std": 0.6661841526627541,
+      "step": 30,
+      "step_time": 30.443527194001945,
+      "tools/call_frequency": 5.58125,
+      "tools/failure_frequency": 0.005842319130897522
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 115.3,
+      "completions/max_terminated_length": 115.3,
+      "completions/mean_length": 70.528125,
+      "completions/mean_terminated_length": 70.528125,
+      "completions/min_length": 38.1,
+      "completions/min_terminated_length": 38.1,
+      "entropy": 0.1547975329682231,
+      "epoch": 1.25,
+      "frac_reward_zero_std": 0.1625,
+      "grad_norm": 3.677471399307251,
+      "learning_rate": 3.9062499999999997e-07,
+      "loss": 0.02682075500488281,
+      "num_tokens": 1412518.0,
+      "reward": 1.0730250239372254,
+      "reward_std": 0.7086882412433624,
+      "rewards/pulse_reward/mean": 1.0730250239372254,
+      "rewards/pulse_reward/std": 0.7086882352828979,
+      "step": 40,
+      "step_time": 11.361506971600466,
+      "tools/call_frequency": 2.215625,
+      "tools/failure_frequency": 0.0015384615398943424
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 83.1,
+      "completions/max_terminated_length": 83.1,
+      "completions/mean_length": 65.421875,
+      "completions/mean_terminated_length": 65.421875,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 0.12452723067253828,
+      "epoch": 1.5625,
+      "frac_reward_zero_std": 0.6625,
+      "grad_norm": 3.609057664871216,
+      "learning_rate": 2.3437499999999998e-07,
+      "loss": -3.346521407365799e-05,
+      "num_tokens": 1741293.0,
+      "reward": 1.4852312803268433,
+      "reward_std": 0.3715872406959534,
+      "rewards/pulse_reward/mean": 1.4852312803268433,
+      "rewards/pulse_reward/std": 0.3715872406959534,
+      "step": 50,
+      "step_time": 8.49522676919878,
+      "tools/call_frequency": 2.0,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 86.6,
+      "completions/max_terminated_length": 86.6,
+      "completions/mean_length": 65.471875,
+      "completions/mean_terminated_length": 65.471875,
+      "completions/min_length": 53.6,
+      "completions/min_terminated_length": 53.6,
+      "entropy": 0.11676975060254335,
+      "epoch": 1.875,
+      "frac_reward_zero_std": 0.8125,
+      "grad_norm": 2.142204761505127,
+      "learning_rate": 7.812499999999999e-08,
+      "loss": -0.00026753861457109453,
+      "num_tokens": 2070084.0,
+      "reward": 1.538143789768219,
+      "reward_std": 0.2553505107760429,
+      "rewards/pulse_reward/mean": 1.538143789768219,
+      "rewards/pulse_reward/std": 0.25535051375627515,
+      "step": 60,
+      "step_time": 8.590878555196104,
+      "tools/call_frequency": 1.996875,
+      "tools/failure_frequency": 0.0
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 74.75,
+      "completions/max_terminated_length": 74.75,
+      "completions/mean_length": 65.3203125,
+      "completions/mean_terminated_length": 65.3203125,
+      "completions/min_length": 61.25,
+      "completions/min_terminated_length": 61.25,
+      "entropy": 0.11779927462339401,
+      "epoch": 2.0,
+      "frac_reward_zero_std": 0.9375,
+      "num_tokens": 2201581.0,
+      "reward": 1.57924222946167,
+      "reward_std": 0.11742392182350159,
+      "rewards/pulse_reward/mean": 1.57924222946167,
+      "rewards/pulse_reward/std": 0.11742392182350159,
+      "step": 64,
+      "step_time": 7.905290340990177,
+      "tools/call_frequency": 2.0078125,
       "tools/failure_frequency": 0.0,
       "total_flos": 0.0,
+      "train_loss": 0.011853500996949151,
+      "train_runtime": 1404.4837,
+      "train_samples_per_second": 0.365,
+      "train_steps_per_second": 0.046
     }
   ],
   "logging_steps": 10,
+  "max_steps": 64,
+  "num_input_tokens_seen": 2201581,
+  "num_train_epochs": 2,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
     }
   },
   "total_flos": 0.0,
+  "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null
 }