Jatin997 commited on
Commit
4696fbc
·
verified ·
1 Parent(s): 800e929

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  checkpoint-12/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  checkpoint-12/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-64/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,7 +1,7 @@
1
  ---
2
  base_model: Qwen/Qwen3-0.6B
3
  library_name: transformers
4
- model_name: pulse_er_grpo_final
5
  tags:
6
  - generated_from_trainer
7
  - hf_jobs
@@ -10,7 +10,7 @@ tags:
10
  licence: license
11
  ---
12
 
13
- # Model Card for pulse_er_grpo_final
14
 
15
  This model is a fine-tuned version of [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B).
16
  It has been trained using [TRL](https://github.com/huggingface/trl).
 
1
  ---
2
  base_model: Qwen/Qwen3-0.6B
3
  library_name: transformers
4
+ model_name: pulse_er_grpo_curve_v2
5
  tags:
6
  - generated_from_trainer
7
  - hf_jobs
 
10
  licence: license
11
  ---
12
 
13
+ # Model Card for pulse_er_grpo_curve_v2
14
 
15
  This model is a fine-tuned version of [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B).
16
  It has been trained using [TRL](https://github.com/huggingface/trl).
checkpoint-64/chat_template.jinja ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- if enable_thinking is defined and enable_thinking is false %}
87
+ {{- '<think>\n\n</think>\n\n' }}
88
+ {%- endif %}
89
+ {%- endif %}
checkpoint-64/config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": 151645,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention"
44
+ ],
45
+ "max_position_embeddings": 40960,
46
+ "max_window_layers": 28,
47
+ "model_type": "qwen3",
48
+ "num_attention_heads": 16,
49
+ "num_hidden_layers": 28,
50
+ "num_key_value_heads": 8,
51
+ "pad_token_id": 151643,
52
+ "rms_norm_eps": 1e-06,
53
+ "rope_parameters": {
54
+ "rope_theta": 1000000,
55
+ "rope_type": "default"
56
+ },
57
+ "sliding_window": null,
58
+ "tie_word_embeddings": true,
59
+ "transformers_version": "5.6.2",
60
+ "use_cache": false,
61
+ "use_sliding_window": false,
62
+ "vocab_size": 151936
63
+ }
checkpoint-64/generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "temperature": 0.6,
9
+ "top_k": 20,
10
+ "top_p": 0.95,
11
+ "transformers_version": "5.6.2"
12
+ }
checkpoint-64/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9daeb3bbd2ebb9b4a9910781ba136204aaeef6c683148c50637f29a347a9b51e
3
+ size 2384234968
checkpoint-64/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae22a58b918c4e55813693f56a60d74d3036e2924f45a5f20888c6e0c8074a2f
3
+ size 4768664614
checkpoint-64/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b95731ce312b6d3f623d553049d545337c5d22f01090116e149fd3cf089643e
3
+ size 14244
checkpoint-64/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:062d9669d1ff2893787af221dc1a10f378ff916c64725933b3d39e4de4dd1029
3
+ size 1064
checkpoint-64/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
checkpoint-64/tokenizer_config.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "local_files_only": false,
25
+ "model_max_length": 131072,
26
+ "pad_token": "<|endoftext|>",
27
+ "padding_side": "left",
28
+ "response_schema": {
29
+ "properties": {
30
+ "content": {
31
+ "type": "string"
32
+ },
33
+ "reasoning_content": {
34
+ "type": "string"
35
+ },
36
+ "role": {
37
+ "const": "assistant"
38
+ },
39
+ "tool_calls": {
40
+ "items": {
41
+ "properties": {
42
+ "function": {
43
+ "properties": {
44
+ "arguments": {
45
+ "additionalProperties": {},
46
+ "type": "object"
47
+ },
48
+ "name": {
49
+ "type": "string"
50
+ }
51
+ },
52
+ "type": "object"
53
+ },
54
+ "type": {
55
+ "const": "function"
56
+ }
57
+ },
58
+ "type": "object",
59
+ "x-parser": "json",
60
+ "x-parser-args": {
61
+ "transform": "{type: 'function', function: @}"
62
+ }
63
+ },
64
+ "type": "array",
65
+ "x-regex-iterator": "<tool_call>\\s*(.+?)\\s*</tool_call>"
66
+ }
67
+ },
68
+ "type": "object",
69
+ "x-regex": "^(?:<think>\\n?(?:(?P<reasoning_content>.*?\\S.*?)\\n?|[\\s]*)</think>\\s*)?(?P<content>.*?)(?:\\n(?=<tool_call>))?(?=(?:<tool_call>|<\\|im_end\\|>|$))(?P<tool_calls>(?:<tool_call>.+?</tool_call>\\s*)+)?\\s*(?:<\\|im_end\\|>|$)"
70
+ },
71
+ "split_special_tokens": false,
72
+ "tokenizer_class": "Qwen2Tokenizer",
73
+ "truncation_side": "left",
74
+ "unk_token": null
75
+ }
checkpoint-64/trainer_state.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 64,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 0.0,
19
+ "completions/max_length": 281.7,
20
+ "completions/max_terminated_length": 281.7,
21
+ "completions/mean_length": 173.90625,
22
+ "completions/mean_terminated_length": 173.90625,
23
+ "completions/min_length": 60.1,
24
+ "completions/min_terminated_length": 60.1,
25
+ "entropy": 0.34981602281332014,
26
+ "epoch": 0.3125,
27
+ "frac_reward_zero_std": 0.2,
28
+ "grad_norm": 3.5286595821380615,
29
+ "learning_rate": 8.593749999999999e-07,
30
+ "loss": -0.03454443216323853,
31
+ "num_tokens": 363490.0,
32
+ "reward": -1.9279687214642762,
33
+ "reward_std": 6.467041900753975,
34
+ "rewards/pulse_reward/mean": -1.9279687214642762,
35
+ "rewards/pulse_reward/std": 6.467041908204555,
36
+ "step": 10,
37
+ "step_time": 35.20241980789724,
38
+ "tools/call_frequency": 5.49375,
39
+ "tools/failure_frequency": 0.019680690905079245
40
+ },
41
+ {
42
+ "clip_ratio/high_max": 0.0,
43
+ "clip_ratio/high_mean": 0.0,
44
+ "clip_ratio/low_mean": 0.0,
45
+ "clip_ratio/low_min": 0.0,
46
+ "clip_ratio/region_mean": 0.0,
47
+ "completions/clipped_ratio": 0.0,
48
+ "completions/max_length": 294.3,
49
+ "completions/max_terminated_length": 294.3,
50
+ "completions/mean_length": 172.56875,
51
+ "completions/mean_terminated_length": 172.56875,
52
+ "completions/min_length": 50.3,
53
+ "completions/min_terminated_length": 50.3,
54
+ "entropy": 0.23517516404390335,
55
+ "epoch": 0.625,
56
+ "frac_reward_zero_std": 0.2625,
57
+ "grad_norm": 3.8192903995513916,
58
+ "learning_rate": 7.031249999999999e-07,
59
+ "loss": 0.01356593519449234,
60
+ "num_tokens": 726552.0,
61
+ "reward": 0.09853125289082527,
62
+ "reward_std": 0.583341383934021,
63
+ "rewards/pulse_reward/mean": 0.09853125289082527,
64
+ "rewards/pulse_reward/std": 0.583341383934021,
65
+ "step": 20,
66
+ "step_time": 39.57589099079487,
67
+ "tools/call_frequency": 6.35,
68
+ "tools/failure_frequency": 0.005996426660567522
69
+ },
70
+ {
71
+ "clip_ratio/high_max": 0.0,
72
+ "clip_ratio/high_mean": 0.0,
73
+ "clip_ratio/low_mean": 0.0,
74
+ "clip_ratio/low_min": 0.0,
75
+ "clip_ratio/region_mean": 0.0,
76
+ "completions/clipped_ratio": 0.0,
77
+ "completions/max_length": 273.0,
78
+ "completions/max_terminated_length": 273.0,
79
+ "completions/mean_length": 149.115625,
80
+ "completions/mean_terminated_length": 149.115625,
81
+ "completions/min_length": 50.9,
82
+ "completions/min_terminated_length": 50.9,
83
+ "entropy": 0.1797773003578186,
84
+ "epoch": 0.9375,
85
+ "frac_reward_zero_std": 0.0875,
86
+ "grad_norm": 5.692322254180908,
87
+ "learning_rate": 5.46875e-07,
88
+ "loss": 0.06958286762237549,
89
+ "num_tokens": 1082109.0,
90
+ "reward": 0.3879156395792961,
91
+ "reward_std": 0.6661841243505477,
92
+ "rewards/pulse_reward/mean": 0.3879156395792961,
93
+ "rewards/pulse_reward/std": 0.6661841526627541,
94
+ "step": 30,
95
+ "step_time": 30.443527194001945,
96
+ "tools/call_frequency": 5.58125,
97
+ "tools/failure_frequency": 0.005842319130897522
98
+ },
99
+ {
100
+ "clip_ratio/high_max": 0.0,
101
+ "clip_ratio/high_mean": 0.0,
102
+ "clip_ratio/low_mean": 0.0,
103
+ "clip_ratio/low_min": 0.0,
104
+ "clip_ratio/region_mean": 0.0,
105
+ "completions/clipped_ratio": 0.0,
106
+ "completions/max_length": 115.3,
107
+ "completions/max_terminated_length": 115.3,
108
+ "completions/mean_length": 70.528125,
109
+ "completions/mean_terminated_length": 70.528125,
110
+ "completions/min_length": 38.1,
111
+ "completions/min_terminated_length": 38.1,
112
+ "entropy": 0.1547975329682231,
113
+ "epoch": 1.25,
114
+ "frac_reward_zero_std": 0.1625,
115
+ "grad_norm": 3.677471399307251,
116
+ "learning_rate": 3.9062499999999997e-07,
117
+ "loss": 0.02682075500488281,
118
+ "num_tokens": 1412518.0,
119
+ "reward": 1.0730250239372254,
120
+ "reward_std": 0.7086882412433624,
121
+ "rewards/pulse_reward/mean": 1.0730250239372254,
122
+ "rewards/pulse_reward/std": 0.7086882352828979,
123
+ "step": 40,
124
+ "step_time": 11.361506971600466,
125
+ "tools/call_frequency": 2.215625,
126
+ "tools/failure_frequency": 0.0015384615398943424
127
+ },
128
+ {
129
+ "clip_ratio/high_max": 0.0,
130
+ "clip_ratio/high_mean": 0.0,
131
+ "clip_ratio/low_mean": 0.0,
132
+ "clip_ratio/low_min": 0.0,
133
+ "clip_ratio/region_mean": 0.0,
134
+ "completions/clipped_ratio": 0.0,
135
+ "completions/max_length": 83.1,
136
+ "completions/max_terminated_length": 83.1,
137
+ "completions/mean_length": 65.421875,
138
+ "completions/mean_terminated_length": 65.421875,
139
+ "completions/min_length": 48.0,
140
+ "completions/min_terminated_length": 48.0,
141
+ "entropy": 0.12452723067253828,
142
+ "epoch": 1.5625,
143
+ "frac_reward_zero_std": 0.6625,
144
+ "grad_norm": 3.609057664871216,
145
+ "learning_rate": 2.3437499999999998e-07,
146
+ "loss": -3.346521407365799e-05,
147
+ "num_tokens": 1741293.0,
148
+ "reward": 1.4852312803268433,
149
+ "reward_std": 0.3715872406959534,
150
+ "rewards/pulse_reward/mean": 1.4852312803268433,
151
+ "rewards/pulse_reward/std": 0.3715872406959534,
152
+ "step": 50,
153
+ "step_time": 8.49522676919878,
154
+ "tools/call_frequency": 2.0,
155
+ "tools/failure_frequency": 0.0
156
+ },
157
+ {
158
+ "clip_ratio/high_max": 0.0,
159
+ "clip_ratio/high_mean": 0.0,
160
+ "clip_ratio/low_mean": 0.0,
161
+ "clip_ratio/low_min": 0.0,
162
+ "clip_ratio/region_mean": 0.0,
163
+ "completions/clipped_ratio": 0.0,
164
+ "completions/max_length": 86.6,
165
+ "completions/max_terminated_length": 86.6,
166
+ "completions/mean_length": 65.471875,
167
+ "completions/mean_terminated_length": 65.471875,
168
+ "completions/min_length": 53.6,
169
+ "completions/min_terminated_length": 53.6,
170
+ "entropy": 0.11676975060254335,
171
+ "epoch": 1.875,
172
+ "frac_reward_zero_std": 0.8125,
173
+ "grad_norm": 2.142204761505127,
174
+ "learning_rate": 7.812499999999999e-08,
175
+ "loss": -0.00026753861457109453,
176
+ "num_tokens": 2070084.0,
177
+ "reward": 1.538143789768219,
178
+ "reward_std": 0.2553505107760429,
179
+ "rewards/pulse_reward/mean": 1.538143789768219,
180
+ "rewards/pulse_reward/std": 0.25535051375627515,
181
+ "step": 60,
182
+ "step_time": 8.590878555196104,
183
+ "tools/call_frequency": 1.996875,
184
+ "tools/failure_frequency": 0.0
185
+ }
186
+ ],
187
+ "logging_steps": 10,
188
+ "max_steps": 64,
189
+ "num_input_tokens_seen": 2201581,
190
+ "num_train_epochs": 2,
191
+ "save_steps": 500,
192
+ "stateful_callbacks": {
193
+ "TrainerControl": {
194
+ "args": {
195
+ "should_epoch_stop": false,
196
+ "should_evaluate": false,
197
+ "should_log": false,
198
+ "should_save": true,
199
+ "should_training_stop": true
200
+ },
201
+ "attributes": {}
202
+ }
203
+ },
204
+ "total_flos": 0.0,
205
+ "train_batch_size": 8,
206
+ "trial_name": null,
207
+ "trial_params": null
208
+ }
checkpoint-64/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:beef4bb9265600265ee78a20bc82f4606af28d145e10bb1b99f8c1c25b2dfef3
3
+ size 6776
completions/completions_00010.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:10c0189e54b95efcd547bc7428c397934ae26e1dcfff7b8224d763f5d309bf56
3
- size 27095
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3ed1e88b120607f7b8cbc21e6d009da8f473f089b81b63a4cc6d2e30f33eda3
3
+ size 31937
completions/completions_00020.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67b0276af96688b86a5d64aa8e3321028609a4197484f003ed3731c25c153538
3
+ size 33667
completions/completions_00030.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1499c8f1a195fe8930fd7338a09d90d9f5551f604fcfdab147f034f0ecc71827
3
+ size 28958
completions/completions_00040.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:646c066b43013c281723f0d4c62facd9a369b7025636ba5658a350bab1a5180e
3
+ size 23850
completions/completions_00050.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d48aad7bd937639532ebe9dba92c318f0ad86db035761861204516149084a29a
3
+ size 23846
completions/completions_00060.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be67ebb6ca105cca7c9f08e11e8259c50ea4bed9b1a3a00a70f000e14ed38c6d
3
+ size 23589
completions/completions_00064.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad2757babd27c9411393c4809b69ca6ee606012f352645a97006503671aa6bcb
3
+ size 23756
metrics/loss_curve.svg CHANGED
metrics/loss_history.json CHANGED
@@ -1,6 +1,26 @@
1
  [
2
  {
3
  "step": 10.0,
4
- "value": -0.015362872183322907
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  }
6
  ]
 
1
  [
2
  {
3
  "step": 10.0,
4
+ "value": -0.03454443216323853
5
+ },
6
+ {
7
+ "step": 20.0,
8
+ "value": 0.01356593519449234
9
+ },
10
+ {
11
+ "step": 30.0,
12
+ "value": 0.06958286762237549
13
+ },
14
+ {
15
+ "step": 40.0,
16
+ "value": 0.02682075500488281
17
+ },
18
+ {
19
+ "step": 50.0,
20
+ "value": -3.346521407365799e-05
21
+ },
22
+ {
23
+ "step": 60.0,
24
+ "value": -0.00026753861457109453
25
  }
26
  ]
metrics/reward_curve.svg CHANGED
metrics/reward_history.json CHANGED
@@ -1,10 +1,30 @@
1
  [
2
  {
3
  "step": 10.0,
4
- "value": -1.56496252566576
5
  },
6
  {
7
- "step": 12.0,
8
- "value": -1.5552499741315842
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  }
10
  ]
 
1
  [
2
  {
3
  "step": 10.0,
4
+ "value": -1.9279687214642762
5
  },
6
  {
7
+ "step": 20.0,
8
+ "value": 0.09853125289082527
9
+ },
10
+ {
11
+ "step": 30.0,
12
+ "value": 0.3879156395792961
13
+ },
14
+ {
15
+ "step": 40.0,
16
+ "value": 1.0730250239372254
17
+ },
18
+ {
19
+ "step": 50.0,
20
+ "value": 1.4852312803268433
21
+ },
22
+ {
23
+ "step": 60.0,
24
+ "value": 1.538143789768219
25
+ },
26
+ {
27
+ "step": 64.0,
28
+ "value": 1.57924222946167
29
  }
30
  ]
run_manifest.json CHANGED
@@ -4,14 +4,14 @@
4
  "env_url": "http://127.0.0.1:8000",
5
  "fp16": false,
6
  "git_commit": "3b3d65241147f5ac61616a235e860943f956b9af",
7
- "gradient_accumulation_steps": 2,
8
  "learning_rate": 1e-06,
9
  "max_steps": 1024,
10
  "model": "Qwen/Qwen3-0.6B",
11
  "num_generations": 4,
12
- "num_samples": 24,
13
- "num_train_epochs": 1.0,
14
- "per_device_train_batch_size": 4,
15
  "scenario_id": "respiratory_distress",
16
  "seed": 42,
17
  "use_cpu": false
 
4
  "env_url": "http://127.0.0.1:8000",
5
  "fp16": false,
6
  "git_commit": "3b3d65241147f5ac61616a235e860943f956b9af",
7
+ "gradient_accumulation_steps": 4,
8
  "learning_rate": 1e-06,
9
  "max_steps": 1024,
10
  "model": "Qwen/Qwen3-0.6B",
11
  "num_generations": 4,
12
+ "num_samples": 256,
13
+ "num_train_epochs": 2.0,
14
+ "per_device_train_batch_size": 8,
15
  "scenario_id": "respiratory_distress",
16
  "seed": 42,
17
  "use_cpu": false
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 12,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -16,27 +16,27 @@
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.0,
19
- "completions/max_length": 233.8,
20
- "completions/max_terminated_length": 233.8,
21
- "completions/mean_length": 170.275,
22
- "completions/mean_terminated_length": 170.275,
23
- "completions/min_length": 88.3,
24
- "completions/min_terminated_length": 88.3,
25
- "entropy": 0.40836348086595536,
26
- "epoch": 0.8333333333333334,
27
- "frac_reward_zero_std": 0.15,
28
- "grad_norm": 7.111741542816162,
29
- "learning_rate": 2.5e-07,
30
- "loss": -0.015362872183322907,
31
- "num_tokens": 90582.0,
32
- "reward": -1.56496252566576,
33
- "reward_std": 4.461238273605704,
34
- "rewards/pulse_reward/mean": -1.56496252566576,
35
- "rewards/pulse_reward/std": 4.461238479241729,
36
  "step": 10,
37
- "step_time": 18.493348840194813,
38
- "tools/call_frequency": 4.75,
39
- "tools/failure_frequency": 0.006923890113830567
40
  },
41
  {
42
  "clip_ratio/high_max": 0.0,
@@ -45,35 +45,180 @@
45
  "clip_ratio/low_min": 0.0,
46
  "clip_ratio/region_mean": 0.0,
47
  "completions/clipped_ratio": 0.0,
48
- "completions/max_length": 260.0,
49
- "completions/max_terminated_length": 260.0,
50
- "completions/mean_length": 166.625,
51
- "completions/mean_terminated_length": 166.625,
52
- "completions/min_length": 87.0,
53
- "completions/min_terminated_length": 87.0,
54
- "entropy": 0.47208209335803986,
55
- "epoch": 1.0,
56
- "frac_reward_zero_std": 0.0,
57
- "num_tokens": 108640.0,
58
- "reward": -1.5552499741315842,
59
- "reward_std": 5.09348089993,
60
- "rewards/pulse_reward/mean": -1.5552499741315842,
61
- "rewards/pulse_reward/std": 5.093480907380581,
62
- "step": 12,
63
- "step_time": 25.044593099497433,
64
- "tools/call_frequency": 4.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  "tools/failure_frequency": 0.0,
66
  "total_flos": 0.0,
67
- "train_loss": -0.013816780100266138,
68
- "train_runtime": 244.674,
69
- "train_samples_per_second": 0.098,
70
- "train_steps_per_second": 0.049
71
  }
72
  ],
73
  "logging_steps": 10,
74
- "max_steps": 12,
75
- "num_input_tokens_seen": 108640,
76
- "num_train_epochs": 1,
77
  "save_steps": 500,
78
  "stateful_callbacks": {
79
  "TrainerControl": {
@@ -88,7 +233,7 @@
88
  }
89
  },
90
  "total_flos": 0.0,
91
- "train_batch_size": 4,
92
  "trial_name": null,
93
  "trial_params": null
94
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
  "eval_steps": 500,
7
+ "global_step": 64,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.0,
19
+ "completions/max_length": 281.7,
20
+ "completions/max_terminated_length": 281.7,
21
+ "completions/mean_length": 173.90625,
22
+ "completions/mean_terminated_length": 173.90625,
23
+ "completions/min_length": 60.1,
24
+ "completions/min_terminated_length": 60.1,
25
+ "entropy": 0.34981602281332014,
26
+ "epoch": 0.3125,
27
+ "frac_reward_zero_std": 0.2,
28
+ "grad_norm": 3.5286595821380615,
29
+ "learning_rate": 8.593749999999999e-07,
30
+ "loss": -0.03454443216323853,
31
+ "num_tokens": 363490.0,
32
+ "reward": -1.9279687214642762,
33
+ "reward_std": 6.467041900753975,
34
+ "rewards/pulse_reward/mean": -1.9279687214642762,
35
+ "rewards/pulse_reward/std": 6.467041908204555,
36
  "step": 10,
37
+ "step_time": 35.20241980789724,
38
+ "tools/call_frequency": 5.49375,
39
+ "tools/failure_frequency": 0.019680690905079245
40
  },
41
  {
42
  "clip_ratio/high_max": 0.0,
 
45
  "clip_ratio/low_min": 0.0,
46
  "clip_ratio/region_mean": 0.0,
47
  "completions/clipped_ratio": 0.0,
48
+ "completions/max_length": 294.3,
49
+ "completions/max_terminated_length": 294.3,
50
+ "completions/mean_length": 172.56875,
51
+ "completions/mean_terminated_length": 172.56875,
52
+ "completions/min_length": 50.3,
53
+ "completions/min_terminated_length": 50.3,
54
+ "entropy": 0.23517516404390335,
55
+ "epoch": 0.625,
56
+ "frac_reward_zero_std": 0.2625,
57
+ "grad_norm": 3.8192903995513916,
58
+ "learning_rate": 7.031249999999999e-07,
59
+ "loss": 0.01356593519449234,
60
+ "num_tokens": 726552.0,
61
+ "reward": 0.09853125289082527,
62
+ "reward_std": 0.583341383934021,
63
+ "rewards/pulse_reward/mean": 0.09853125289082527,
64
+ "rewards/pulse_reward/std": 0.583341383934021,
65
+ "step": 20,
66
+ "step_time": 39.57589099079487,
67
+ "tools/call_frequency": 6.35,
68
+ "tools/failure_frequency": 0.005996426660567522
69
+ },
70
+ {
71
+ "clip_ratio/high_max": 0.0,
72
+ "clip_ratio/high_mean": 0.0,
73
+ "clip_ratio/low_mean": 0.0,
74
+ "clip_ratio/low_min": 0.0,
75
+ "clip_ratio/region_mean": 0.0,
76
+ "completions/clipped_ratio": 0.0,
77
+ "completions/max_length": 273.0,
78
+ "completions/max_terminated_length": 273.0,
79
+ "completions/mean_length": 149.115625,
80
+ "completions/mean_terminated_length": 149.115625,
81
+ "completions/min_length": 50.9,
82
+ "completions/min_terminated_length": 50.9,
83
+ "entropy": 0.1797773003578186,
84
+ "epoch": 0.9375,
85
+ "frac_reward_zero_std": 0.0875,
86
+ "grad_norm": 5.692322254180908,
87
+ "learning_rate": 5.46875e-07,
88
+ "loss": 0.06958286762237549,
89
+ "num_tokens": 1082109.0,
90
+ "reward": 0.3879156395792961,
91
+ "reward_std": 0.6661841243505477,
92
+ "rewards/pulse_reward/mean": 0.3879156395792961,
93
+ "rewards/pulse_reward/std": 0.6661841526627541,
94
+ "step": 30,
95
+ "step_time": 30.443527194001945,
96
+ "tools/call_frequency": 5.58125,
97
+ "tools/failure_frequency": 0.005842319130897522
98
+ },
99
+ {
100
+ "clip_ratio/high_max": 0.0,
101
+ "clip_ratio/high_mean": 0.0,
102
+ "clip_ratio/low_mean": 0.0,
103
+ "clip_ratio/low_min": 0.0,
104
+ "clip_ratio/region_mean": 0.0,
105
+ "completions/clipped_ratio": 0.0,
106
+ "completions/max_length": 115.3,
107
+ "completions/max_terminated_length": 115.3,
108
+ "completions/mean_length": 70.528125,
109
+ "completions/mean_terminated_length": 70.528125,
110
+ "completions/min_length": 38.1,
111
+ "completions/min_terminated_length": 38.1,
112
+ "entropy": 0.1547975329682231,
113
+ "epoch": 1.25,
114
+ "frac_reward_zero_std": 0.1625,
115
+ "grad_norm": 3.677471399307251,
116
+ "learning_rate": 3.9062499999999997e-07,
117
+ "loss": 0.02682075500488281,
118
+ "num_tokens": 1412518.0,
119
+ "reward": 1.0730250239372254,
120
+ "reward_std": 0.7086882412433624,
121
+ "rewards/pulse_reward/mean": 1.0730250239372254,
122
+ "rewards/pulse_reward/std": 0.7086882352828979,
123
+ "step": 40,
124
+ "step_time": 11.361506971600466,
125
+ "tools/call_frequency": 2.215625,
126
+ "tools/failure_frequency": 0.0015384615398943424
127
+ },
128
+ {
129
+ "clip_ratio/high_max": 0.0,
130
+ "clip_ratio/high_mean": 0.0,
131
+ "clip_ratio/low_mean": 0.0,
132
+ "clip_ratio/low_min": 0.0,
133
+ "clip_ratio/region_mean": 0.0,
134
+ "completions/clipped_ratio": 0.0,
135
+ "completions/max_length": 83.1,
136
+ "completions/max_terminated_length": 83.1,
137
+ "completions/mean_length": 65.421875,
138
+ "completions/mean_terminated_length": 65.421875,
139
+ "completions/min_length": 48.0,
140
+ "completions/min_terminated_length": 48.0,
141
+ "entropy": 0.12452723067253828,
142
+ "epoch": 1.5625,
143
+ "frac_reward_zero_std": 0.6625,
144
+ "grad_norm": 3.609057664871216,
145
+ "learning_rate": 2.3437499999999998e-07,
146
+ "loss": -3.346521407365799e-05,
147
+ "num_tokens": 1741293.0,
148
+ "reward": 1.4852312803268433,
149
+ "reward_std": 0.3715872406959534,
150
+ "rewards/pulse_reward/mean": 1.4852312803268433,
151
+ "rewards/pulse_reward/std": 0.3715872406959534,
152
+ "step": 50,
153
+ "step_time": 8.49522676919878,
154
+ "tools/call_frequency": 2.0,
155
+ "tools/failure_frequency": 0.0
156
+ },
157
+ {
158
+ "clip_ratio/high_max": 0.0,
159
+ "clip_ratio/high_mean": 0.0,
160
+ "clip_ratio/low_mean": 0.0,
161
+ "clip_ratio/low_min": 0.0,
162
+ "clip_ratio/region_mean": 0.0,
163
+ "completions/clipped_ratio": 0.0,
164
+ "completions/max_length": 86.6,
165
+ "completions/max_terminated_length": 86.6,
166
+ "completions/mean_length": 65.471875,
167
+ "completions/mean_terminated_length": 65.471875,
168
+ "completions/min_length": 53.6,
169
+ "completions/min_terminated_length": 53.6,
170
+ "entropy": 0.11676975060254335,
171
+ "epoch": 1.875,
172
+ "frac_reward_zero_std": 0.8125,
173
+ "grad_norm": 2.142204761505127,
174
+ "learning_rate": 7.812499999999999e-08,
175
+ "loss": -0.00026753861457109453,
176
+ "num_tokens": 2070084.0,
177
+ "reward": 1.538143789768219,
178
+ "reward_std": 0.2553505107760429,
179
+ "rewards/pulse_reward/mean": 1.538143789768219,
180
+ "rewards/pulse_reward/std": 0.25535051375627515,
181
+ "step": 60,
182
+ "step_time": 8.590878555196104,
183
+ "tools/call_frequency": 1.996875,
184
+ "tools/failure_frequency": 0.0
185
+ },
186
+ {
187
+ "clip_ratio/high_max": 0.0,
188
+ "clip_ratio/high_mean": 0.0,
189
+ "clip_ratio/low_mean": 0.0,
190
+ "clip_ratio/low_min": 0.0,
191
+ "clip_ratio/region_mean": 0.0,
192
+ "completions/clipped_ratio": 0.0,
193
+ "completions/max_length": 74.75,
194
+ "completions/max_terminated_length": 74.75,
195
+ "completions/mean_length": 65.3203125,
196
+ "completions/mean_terminated_length": 65.3203125,
197
+ "completions/min_length": 61.25,
198
+ "completions/min_terminated_length": 61.25,
199
+ "entropy": 0.11779927462339401,
200
+ "epoch": 2.0,
201
+ "frac_reward_zero_std": 0.9375,
202
+ "num_tokens": 2201581.0,
203
+ "reward": 1.57924222946167,
204
+ "reward_std": 0.11742392182350159,
205
+ "rewards/pulse_reward/mean": 1.57924222946167,
206
+ "rewards/pulse_reward/std": 0.11742392182350159,
207
+ "step": 64,
208
+ "step_time": 7.905290340990177,
209
+ "tools/call_frequency": 2.0078125,
210
  "tools/failure_frequency": 0.0,
211
  "total_flos": 0.0,
212
+ "train_loss": 0.011853500996949151,
213
+ "train_runtime": 1404.4837,
214
+ "train_samples_per_second": 0.365,
215
+ "train_steps_per_second": 0.046
216
  }
217
  ],
218
  "logging_steps": 10,
219
+ "max_steps": 64,
220
+ "num_input_tokens_seen": 2201581,
221
+ "num_train_epochs": 2,
222
  "save_steps": 500,
223
  "stateful_callbacks": {
224
  "TrainerControl": {
 
233
  }
234
  },
235
  "total_flos": 0.0,
236
+ "train_batch_size": 8,
237
  "trial_name": null,
238
  "trial_params": null
239
  }