reshinthadith commited on
Commit
725cd6e
·
verified ·
1 Parent(s): 6164439

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
chat_template.jinja ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- if enable_thinking is defined and enable_thinking is false %}
87
+ {{- '<think>\n\n</think>\n\n' }}
88
+ {%- endif %}
89
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "dtype": "float32",
8
+ "eos_token_id": 151645,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_types": [
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention"
43
+ ],
44
+ "max_position_embeddings": 40960,
45
+ "max_window_layers": 28,
46
+ "model_type": "qwen3",
47
+ "num_attention_heads": 16,
48
+ "num_hidden_layers": 28,
49
+ "num_key_value_heads": 8,
50
+ "pad_token_id": 151643,
51
+ "rms_norm_eps": 1e-06,
52
+ "rope_scaling": null,
53
+ "rope_theta": 1000000,
54
+ "sliding_window": null,
55
+ "tie_word_embeddings": true,
56
+ "transformers_version": "4.57.3",
57
+ "use_cache": true,
58
+ "use_sliding_window": false,
59
+ "vocab_size": 151936
60
+ }
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645
5
+ ],
6
+ "max_new_tokens": 256,
7
+ "no_repeat_ngram_size": 3,
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.2,
10
+ "temperature": 0.8,
11
+ "top_p": 0.9,
12
+ "transformers_version": "4.57.3"
13
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12653519e7d39024353f4a93f19edf37f54caf6ffc07c68b6be6b33cb93f0d98
3
+ size 2384234968
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16f5d31ce3f94b227a9d4aa959feefa425365447269aff6899cca46566d6ead4
3
+ size 4768669395
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35059eaf262b99e25d2c89d034a29d2e08b6edbee1cb70e6dc4f5d5c11035ca2
3
+ size 15877
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fe6a4b07713b30a25984a8dfee04d369213e2b4f829b83a3b5317cad7b9545a
3
+ size 15877
rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d76f48ea09dc133c9ecbc6e1c2ed10f9450cfd601b85b1ae1b0c11c8427a7db
3
+ size 15877
rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:625137f2eb94fef4201a4c4649e55b1db1f7f74a2aeb909aaaaf7dc793a74ce5
3
+ size 15941
rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:428bdf950c87573bc2ea172844984571a5b17561d48f684620ca1e992f2d2c7c
3
+ size 15877
rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efdde4beb41e3d4ddbb9aba88a5ac0b95b701500c1e28655ce4c10888b4ae952
3
+ size 15877
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bc307f08d8a3dc6e024728ee9eed82e329c09ca8e6fe88c614e51e39f1bca91
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67cc0080ffd7555f723f423c27cfef314e1ad9d335c8b79f465c5faba1ed478b
3
+ size 11422821
tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
trainer_state.json ADDED
@@ -0,0 +1,2274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.8,
6
+ "eval_steps": 500,
7
+ "global_step": 400,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 0.6791666865348815,
19
+ "completions/max_length": 256.0,
20
+ "completions/max_terminated_length": 247.0,
21
+ "completions/mean_length": 236.5416717529297,
22
+ "completions/mean_terminated_length": 200.61068420410157,
23
+ "completions/min_length": 127.2,
24
+ "completions/min_terminated_length": 127.2,
25
+ "entropy": 1.070425021648407,
26
+ "epoch": 0.01,
27
+ "frac_reward_zero_std": 0.0,
28
+ "grad_norm": 3.5985119342803955,
29
+ "kl": 0.0035211448557674885,
30
+ "learning_rate": 4.980000000000001e-06,
31
+ "loss": 0.0011,
32
+ "num_tokens": 73826.0,
33
+ "reward": 0.2776068687438965,
34
+ "reward_std": 0.5232907950878143,
35
+ "rewards/politeness_reward_func/mean": 0.27760685607790947,
36
+ "rewards/politeness_reward_func/std": 0.5940477013587951,
37
+ "step": 5,
38
+ "step_time": 11.145096338540316
39
+ },
40
+ {
41
+ "clip_ratio/high_max": 0.0,
42
+ "clip_ratio/high_mean": 0.0,
43
+ "clip_ratio/low_mean": 0.0,
44
+ "clip_ratio/low_min": 0.0,
45
+ "clip_ratio/region_mean": 0.0,
46
+ "completions/clipped_ratio": 0.6208333492279052,
47
+ "completions/max_length": 256.0,
48
+ "completions/max_terminated_length": 246.8,
49
+ "completions/mean_length": 232.80417175292968,
50
+ "completions/mean_terminated_length": 204.57913513183593,
51
+ "completions/min_length": 148.6,
52
+ "completions/min_terminated_length": 148.6,
53
+ "entropy": 1.070945155620575,
54
+ "epoch": 0.02,
55
+ "frac_reward_zero_std": 0.0,
56
+ "grad_norm": 3.3468329906463623,
57
+ "kl": 0.02206921111792326,
58
+ "learning_rate": 4.955e-06,
59
+ "loss": 0.0224,
60
+ "num_tokens": 147283.0,
61
+ "reward": 0.6233123302459717,
62
+ "reward_std": 0.4648557245731354,
63
+ "rewards/politeness_reward_func/mean": 0.6233122944831848,
64
+ "rewards/politeness_reward_func/std": 0.553675526380539,
65
+ "step": 10,
66
+ "step_time": 10.835208699852227
67
+ },
68
+ {
69
+ "clip_ratio/high_max": 0.0,
70
+ "clip_ratio/high_mean": 0.0,
71
+ "clip_ratio/low_mean": 0.0,
72
+ "clip_ratio/low_min": 0.0,
73
+ "clip_ratio/region_mean": 0.0,
74
+ "completions/clipped_ratio": 0.5000000119209289,
75
+ "completions/max_length": 256.0,
76
+ "completions/max_terminated_length": 249.6,
77
+ "completions/mean_length": 221.8000030517578,
78
+ "completions/mean_terminated_length": 188.9316619873047,
79
+ "completions/min_length": 110.8,
80
+ "completions/min_terminated_length": 110.8,
81
+ "entropy": 1.1163462162017823,
82
+ "epoch": 0.03,
83
+ "frac_reward_zero_std": 0.0,
84
+ "grad_norm": 3.3379204273223877,
85
+ "kl": 0.04084419272840023,
86
+ "learning_rate": 4.93e-06,
87
+ "loss": -0.0052,
88
+ "num_tokens": 218131.0,
89
+ "reward": 0.8122925400733948,
90
+ "reward_std": 0.41203114986419676,
91
+ "rewards/politeness_reward_func/mean": 0.8122925400733948,
92
+ "rewards/politeness_reward_func/std": 0.4989388585090637,
93
+ "step": 15,
94
+ "step_time": 10.806195054203272
95
+ },
96
+ {
97
+ "clip_ratio/high_max": 0.0,
98
+ "clip_ratio/high_mean": 0.0,
99
+ "clip_ratio/low_mean": 0.0,
100
+ "clip_ratio/low_min": 0.0,
101
+ "clip_ratio/region_mean": 0.0,
102
+ "completions/clipped_ratio": 0.6791666865348815,
103
+ "completions/max_length": 256.0,
104
+ "completions/max_terminated_length": 251.6,
105
+ "completions/mean_length": 238.19584045410156,
106
+ "completions/mean_terminated_length": 201.80428771972657,
107
+ "completions/min_length": 130.2,
108
+ "completions/min_terminated_length": 130.2,
109
+ "entropy": 1.1251786351203918,
110
+ "epoch": 0.04,
111
+ "frac_reward_zero_std": 0.0,
112
+ "grad_norm": 3.3837928771972656,
113
+ "kl": 0.0385904410853982,
114
+ "learning_rate": 4.9050000000000005e-06,
115
+ "loss": -0.0031,
116
+ "num_tokens": 292306.0,
117
+ "reward": 0.6734941720962524,
118
+ "reward_std": 0.448628181219101,
119
+ "rewards/politeness_reward_func/mean": 0.6734941720962524,
120
+ "rewards/politeness_reward_func/std": 0.509108817577362,
121
+ "step": 20,
122
+ "step_time": 10.808321141451597
123
+ },
124
+ {
125
+ "clip_ratio/high_max": 0.0,
126
+ "clip_ratio/high_mean": 0.0,
127
+ "clip_ratio/low_mean": 0.0,
128
+ "clip_ratio/low_min": 0.0,
129
+ "clip_ratio/region_mean": 0.0,
130
+ "completions/clipped_ratio": 0.7333333551883697,
131
+ "completions/max_length": 256.0,
132
+ "completions/max_terminated_length": 246.6,
133
+ "completions/mean_length": 239.7041748046875,
134
+ "completions/mean_terminated_length": 201.94395141601564,
135
+ "completions/min_length": 144.0,
136
+ "completions/min_terminated_length": 144.0,
137
+ "entropy": 1.0987902224063872,
138
+ "epoch": 0.05,
139
+ "frac_reward_zero_std": 0.0,
140
+ "grad_norm": 3.3908112049102783,
141
+ "kl": 0.04714170750230551,
142
+ "learning_rate": 4.880000000000001e-06,
143
+ "loss": 0.013,
144
+ "num_tokens": 366731.0,
145
+ "reward": 0.7238378047943115,
146
+ "reward_std": 0.5012153267860413,
147
+ "rewards/politeness_reward_func/mean": 0.7238377809524537,
148
+ "rewards/politeness_reward_func/std": 0.5635832965373992,
149
+ "step": 25,
150
+ "step_time": 10.778460966795683
151
+ },
152
+ {
153
+ "clip_ratio/high_max": 0.0,
154
+ "clip_ratio/high_mean": 0.0,
155
+ "clip_ratio/low_mean": 0.0,
156
+ "clip_ratio/low_min": 0.0,
157
+ "clip_ratio/region_mean": 0.0,
158
+ "completions/clipped_ratio": 0.7041666984558106,
159
+ "completions/max_length": 256.0,
160
+ "completions/max_terminated_length": 251.4,
161
+ "completions/mean_length": 238.33334045410157,
162
+ "completions/mean_terminated_length": 209.13802490234374,
163
+ "completions/min_length": 135.6,
164
+ "completions/min_terminated_length": 135.6,
165
+ "entropy": 1.0942281484603882,
166
+ "epoch": 0.06,
167
+ "frac_reward_zero_std": 0.0,
168
+ "grad_norm": 3.272942304611206,
169
+ "kl": 0.04730561040341854,
170
+ "learning_rate": 4.855e-06,
171
+ "loss": -0.0003,
172
+ "num_tokens": 441211.0,
173
+ "reward": 0.7387123763561249,
174
+ "reward_std": 0.3938853979110718,
175
+ "rewards/politeness_reward_func/mean": 0.7387123882770539,
176
+ "rewards/politeness_reward_func/std": 0.5152743279933929,
177
+ "step": 30,
178
+ "step_time": 10.791689745336772
179
+ },
180
+ {
181
+ "clip_ratio/high_max": 0.0,
182
+ "clip_ratio/high_mean": 0.0,
183
+ "clip_ratio/low_mean": 0.0,
184
+ "clip_ratio/low_min": 0.0,
185
+ "clip_ratio/region_mean": 0.0,
186
+ "completions/clipped_ratio": 0.6000000178813935,
187
+ "completions/max_length": 256.0,
188
+ "completions/max_terminated_length": 250.8,
189
+ "completions/mean_length": 232.95417175292968,
190
+ "completions/mean_terminated_length": 200.11821594238282,
191
+ "completions/min_length": 126.0,
192
+ "completions/min_terminated_length": 126.0,
193
+ "entropy": 1.1196449041366576,
194
+ "epoch": 0.07,
195
+ "frac_reward_zero_std": 0.0,
196
+ "grad_norm": 3.515143394470215,
197
+ "kl": 0.055315449833869934,
198
+ "learning_rate": 4.83e-06,
199
+ "loss": 0.0018,
200
+ "num_tokens": 514144.0,
201
+ "reward": 0.9808868050575257,
202
+ "reward_std": 0.3419617712497711,
203
+ "rewards/politeness_reward_func/mean": 0.9808867692947387,
204
+ "rewards/politeness_reward_func/std": 0.4772939085960388,
205
+ "step": 35,
206
+ "step_time": 10.767037071287632
207
+ },
208
+ {
209
+ "clip_ratio/high_max": 0.0,
210
+ "clip_ratio/high_mean": 0.0,
211
+ "clip_ratio/low_mean": 0.0,
212
+ "clip_ratio/low_min": 0.0,
213
+ "clip_ratio/region_mean": 0.0,
214
+ "completions/clipped_ratio": 0.4333333432674408,
215
+ "completions/max_length": 256.0,
216
+ "completions/max_terminated_length": 253.2,
217
+ "completions/mean_length": 216.89583740234374,
218
+ "completions/mean_terminated_length": 186.88653869628905,
219
+ "completions/min_length": 90.4,
220
+ "completions/min_terminated_length": 90.4,
221
+ "entropy": 1.062067198753357,
222
+ "epoch": 0.08,
223
+ "frac_reward_zero_std": 0.0,
224
+ "grad_norm": 3.2335519790649414,
225
+ "kl": 0.06222313120961189,
226
+ "learning_rate": 4.805000000000001e-06,
227
+ "loss": 0.0425,
228
+ "num_tokens": 583095.0,
229
+ "reward": 0.873325777053833,
230
+ "reward_std": 0.4448536515235901,
231
+ "rewards/politeness_reward_func/mean": 0.873325777053833,
232
+ "rewards/politeness_reward_func/std": 0.49484293460845946,
233
+ "step": 40,
234
+ "step_time": 10.696332201361656
235
+ },
236
+ {
237
+ "clip_ratio/high_max": 0.0,
238
+ "clip_ratio/high_mean": 0.0,
239
+ "clip_ratio/low_mean": 0.0,
240
+ "clip_ratio/low_min": 0.0,
241
+ "clip_ratio/region_mean": 0.0,
242
+ "completions/clipped_ratio": 0.5666666746139526,
243
+ "completions/max_length": 256.0,
244
+ "completions/max_terminated_length": 248.6,
245
+ "completions/mean_length": 220.82500610351562,
246
+ "completions/mean_terminated_length": 175.39512634277344,
247
+ "completions/min_length": 75.6,
248
+ "completions/min_terminated_length": 75.6,
249
+ "entropy": 1.0480997025966645,
250
+ "epoch": 0.09,
251
+ "frac_reward_zero_std": 0.0,
252
+ "grad_norm": 3.452233076095581,
253
+ "kl": 0.07400040999054909,
254
+ "learning_rate": 4.78e-06,
255
+ "loss": 0.0107,
256
+ "num_tokens": 653373.0,
257
+ "reward": 0.9472709178924561,
258
+ "reward_std": 0.40665341913700104,
259
+ "rewards/politeness_reward_func/mean": 0.9472708940505982,
260
+ "rewards/politeness_reward_func/std": 0.4510787308216095,
261
+ "step": 45,
262
+ "step_time": 10.80831560343504
263
+ },
264
+ {
265
+ "clip_ratio/high_max": 0.0,
266
+ "clip_ratio/high_mean": 0.0,
267
+ "clip_ratio/low_mean": 0.0,
268
+ "clip_ratio/low_min": 0.0,
269
+ "clip_ratio/region_mean": 0.0,
270
+ "completions/clipped_ratio": 0.5416666865348816,
271
+ "completions/max_length": 256.0,
272
+ "completions/max_terminated_length": 249.8,
273
+ "completions/mean_length": 208.1916717529297,
274
+ "completions/mean_terminated_length": 153.92529907226563,
275
+ "completions/min_length": 61.8,
276
+ "completions/min_terminated_length": 61.8,
277
+ "entropy": 1.1236772537231445,
278
+ "epoch": 0.1,
279
+ "frac_reward_zero_std": 0.0,
280
+ "grad_norm": 3.160369634628296,
281
+ "kl": 0.08055943846702576,
282
+ "learning_rate": 4.755e-06,
283
+ "loss": 0.0611,
284
+ "num_tokens": 720619.0,
285
+ "reward": 0.8770254850387573,
286
+ "reward_std": 0.5482801616191864,
287
+ "rewards/politeness_reward_func/mean": 0.8770254492759705,
288
+ "rewards/politeness_reward_func/std": 0.5923300921916962,
289
+ "step": 50,
290
+ "step_time": 10.79710350483656
291
+ },
292
+ {
293
+ "clip_ratio/high_max": 0.0,
294
+ "clip_ratio/high_mean": 0.0,
295
+ "clip_ratio/low_mean": 0.0,
296
+ "clip_ratio/low_min": 0.0,
297
+ "clip_ratio/region_mean": 0.0,
298
+ "completions/clipped_ratio": 0.4083333373069763,
299
+ "completions/max_length": 256.0,
300
+ "completions/max_terminated_length": 245.4,
301
+ "completions/mean_length": 180.80833740234374,
302
+ "completions/mean_terminated_length": 134.77063598632813,
303
+ "completions/min_length": 46.0,
304
+ "completions/min_terminated_length": 46.0,
305
+ "entropy": 1.2555986642837524,
306
+ "epoch": 0.11,
307
+ "frac_reward_zero_std": 0.0,
308
+ "grad_norm": 5.3433427810668945,
309
+ "kl": 0.09562707468867301,
310
+ "learning_rate": 4.7300000000000005e-06,
311
+ "loss": 0.0773,
312
+ "num_tokens": 781261.0,
313
+ "reward": 1.092347764968872,
314
+ "reward_std": 0.4649295687675476,
315
+ "rewards/politeness_reward_func/mean": 1.0923476934432983,
316
+ "rewards/politeness_reward_func/std": 0.5502241551876068,
317
+ "step": 55,
318
+ "step_time": 10.756779878586531
319
+ },
320
+ {
321
+ "clip_ratio/high_max": 0.0,
322
+ "clip_ratio/high_mean": 0.0,
323
+ "clip_ratio/low_mean": 0.0,
324
+ "clip_ratio/low_min": 0.0,
325
+ "clip_ratio/region_mean": 0.0,
326
+ "completions/clipped_ratio": 0.2875000063329935,
327
+ "completions/max_length": 256.0,
328
+ "completions/max_terminated_length": 238.8,
329
+ "completions/mean_length": 154.1916702270508,
330
+ "completions/mean_terminated_length": 119.38919677734376,
331
+ "completions/min_length": 42.8,
332
+ "completions/min_terminated_length": 42.8,
333
+ "entropy": 1.3319262266159058,
334
+ "epoch": 0.12,
335
+ "frac_reward_zero_std": 0.0,
336
+ "grad_norm": 3.3366644382476807,
337
+ "kl": 0.14686549603939056,
338
+ "learning_rate": 4.705e-06,
339
+ "loss": 0.0456,
340
+ "num_tokens": 835835.0,
341
+ "reward": 1.4423248767852783,
342
+ "reward_std": 0.36013287007808686,
343
+ "rewards/politeness_reward_func/mean": 1.4423248767852783,
344
+ "rewards/politeness_reward_func/std": 0.40978127121925356,
345
+ "step": 60,
346
+ "step_time": 10.825969719141721
347
+ },
348
+ {
349
+ "clip_ratio/high_max": 0.0,
350
+ "clip_ratio/high_mean": 0.0,
351
+ "clip_ratio/low_mean": 0.0,
352
+ "clip_ratio/low_min": 0.0,
353
+ "clip_ratio/region_mean": 0.0,
354
+ "completions/clipped_ratio": 0.41250001192092894,
355
+ "completions/max_length": 256.0,
356
+ "completions/max_terminated_length": 247.4,
357
+ "completions/mean_length": 183.07083740234376,
358
+ "completions/mean_terminated_length": 133.48724670410155,
359
+ "completions/min_length": 46.8,
360
+ "completions/min_terminated_length": 46.8,
361
+ "entropy": 1.3182121157646178,
362
+ "epoch": 0.13,
363
+ "frac_reward_zero_std": 0.0,
364
+ "grad_norm": 25.555892944335938,
365
+ "kl": 0.17301606237888337,
366
+ "learning_rate": 4.680000000000001e-06,
367
+ "loss": 0.0985,
368
+ "num_tokens": 896732.0,
369
+ "reward": 1.3051372289657592,
370
+ "reward_std": 0.43976340293884275,
371
+ "rewards/politeness_reward_func/mean": 1.3051372170448303,
372
+ "rewards/politeness_reward_func/std": 0.5006832242012024,
373
+ "step": 65,
374
+ "step_time": 10.645895641297102
375
+ },
376
+ {
377
+ "clip_ratio/high_max": 0.0,
378
+ "clip_ratio/high_mean": 0.0,
379
+ "clip_ratio/low_mean": 0.0,
380
+ "clip_ratio/low_min": 0.0,
381
+ "clip_ratio/region_mean": 0.0,
382
+ "completions/clipped_ratio": 0.2708333432674408,
383
+ "completions/max_length": 256.0,
384
+ "completions/max_terminated_length": 246.4,
385
+ "completions/mean_length": 157.8791717529297,
386
+ "completions/mean_terminated_length": 123.51410827636718,
387
+ "completions/min_length": 41.4,
388
+ "completions/min_terminated_length": 41.4,
389
+ "entropy": 1.2897038578987121,
390
+ "epoch": 0.14,
391
+ "frac_reward_zero_std": 0.0,
392
+ "grad_norm": 3.8760225772857666,
393
+ "kl": 0.19847530573606492,
394
+ "learning_rate": 4.655e-06,
395
+ "loss": 0.1906,
396
+ "num_tokens": 951999.0,
397
+ "reward": 1.420637583732605,
398
+ "reward_std": 0.430054372549057,
399
+ "rewards/politeness_reward_func/mean": 1.420637583732605,
400
+ "rewards/politeness_reward_func/std": 0.48181390166282656,
401
+ "step": 70,
402
+ "step_time": 10.596996573358775
403
+ },
404
+ {
405
+ "clip_ratio/high_max": 0.0,
406
+ "clip_ratio/high_mean": 0.0,
407
+ "clip_ratio/low_mean": 0.0,
408
+ "clip_ratio/low_min": 0.0,
409
+ "clip_ratio/region_mean": 0.0,
410
+ "completions/clipped_ratio": 0.2083333432674408,
411
+ "completions/max_length": 256.0,
412
+ "completions/max_terminated_length": 240.4,
413
+ "completions/mean_length": 146.60000610351562,
414
+ "completions/mean_terminated_length": 118.93135223388671,
415
+ "completions/min_length": 43.2,
416
+ "completions/min_terminated_length": 43.2,
417
+ "entropy": 1.4299420952796935,
418
+ "epoch": 0.15,
419
+ "frac_reward_zero_std": 0.0,
420
+ "grad_norm": 4.174029350280762,
421
+ "kl": 0.20222627967596055,
422
+ "learning_rate": 4.6300000000000006e-06,
423
+ "loss": 0.1187,
424
+ "num_tokens": 1004287.0,
425
+ "reward": 1.5081899881362915,
426
+ "reward_std": 0.33264915347099305,
427
+ "rewards/politeness_reward_func/mean": 1.508189868927002,
428
+ "rewards/politeness_reward_func/std": 0.36398229002952576,
429
+ "step": 75,
430
+ "step_time": 10.52198056280613
431
+ },
432
+ {
433
+ "clip_ratio/high_max": 0.0,
434
+ "clip_ratio/high_mean": 0.0,
435
+ "clip_ratio/low_mean": 0.0,
436
+ "clip_ratio/low_min": 0.0,
437
+ "clip_ratio/region_mean": 0.0,
438
+ "completions/clipped_ratio": 0.12916667088866235,
439
+ "completions/max_length": 256.0,
440
+ "completions/max_terminated_length": 242.6,
441
+ "completions/mean_length": 142.55417175292968,
442
+ "completions/mean_terminated_length": 126.68900451660156,
443
+ "completions/min_length": 48.2,
444
+ "completions/min_terminated_length": 48.2,
445
+ "entropy": 1.3346869945526123,
446
+ "epoch": 0.16,
447
+ "frac_reward_zero_std": 0.0,
448
+ "grad_norm": 6.63394021987915,
449
+ "kl": 0.19363965839147568,
450
+ "learning_rate": 4.605000000000001e-06,
451
+ "loss": 0.0898,
452
+ "num_tokens": 1056308.0,
453
+ "reward": 1.459042239189148,
454
+ "reward_std": 0.2736098259687424,
455
+ "rewards/politeness_reward_func/mean": 1.45904221534729,
456
+ "rewards/politeness_reward_func/std": 0.3277650147676468,
457
+ "step": 80,
458
+ "step_time": 10.517034325748682
459
+ },
460
+ {
461
+ "clip_ratio/high_max": 0.0,
462
+ "clip_ratio/high_mean": 0.0,
463
+ "clip_ratio/low_mean": 0.0,
464
+ "clip_ratio/low_min": 0.0,
465
+ "clip_ratio/region_mean": 0.0,
466
+ "completions/clipped_ratio": 0.20416666716337203,
467
+ "completions/max_length": 256.0,
468
+ "completions/max_terminated_length": 241.2,
469
+ "completions/mean_length": 137.8166717529297,
470
+ "completions/mean_terminated_length": 107.75043640136718,
471
+ "completions/min_length": 39.2,
472
+ "completions/min_terminated_length": 39.2,
473
+ "entropy": 1.0585298597812653,
474
+ "epoch": 0.17,
475
+ "frac_reward_zero_std": 0.0,
476
+ "grad_norm": 6.915429592132568,
477
+ "kl": 0.2862268269062042,
478
+ "learning_rate": 4.58e-06,
479
+ "loss": 0.2369,
480
+ "num_tokens": 1106584.0,
481
+ "reward": 1.339914321899414,
482
+ "reward_std": 0.42014909982681276,
483
+ "rewards/politeness_reward_func/mean": 1.3399142742156982,
484
+ "rewards/politeness_reward_func/std": 0.46948710083961487,
485
+ "step": 85,
486
+ "step_time": 10.558716806769372
487
+ },
488
+ {
489
+ "clip_ratio/high_max": 0.0,
490
+ "clip_ratio/high_mean": 0.0,
491
+ "clip_ratio/low_mean": 0.0,
492
+ "clip_ratio/low_min": 0.0,
493
+ "clip_ratio/region_mean": 0.0,
494
+ "completions/clipped_ratio": 0.26250001005828383,
495
+ "completions/max_length": 256.0,
496
+ "completions/max_terminated_length": 232.6,
497
+ "completions/mean_length": 154.93750457763673,
498
+ "completions/mean_terminated_length": 121.31997375488281,
499
+ "completions/min_length": 47.0,
500
+ "completions/min_terminated_length": 47.0,
501
+ "entropy": 1.208097517490387,
502
+ "epoch": 0.18,
503
+ "frac_reward_zero_std": 0.0,
504
+ "grad_norm": 4.7058539390563965,
505
+ "kl": 0.17084270417690278,
506
+ "learning_rate": 4.5550000000000004e-06,
507
+ "loss": 0.0854,
508
+ "num_tokens": 1161209.0,
509
+ "reward": 1.4180654048919679,
510
+ "reward_std": 0.24261603355407715,
511
+ "rewards/politeness_reward_func/mean": 1.4180653095245361,
512
+ "rewards/politeness_reward_func/std": 0.3126596033573151,
513
+ "step": 90,
514
+ "step_time": 10.573557010293007
515
+ },
516
+ {
517
+ "clip_ratio/high_max": 0.0,
518
+ "clip_ratio/high_mean": 0.0,
519
+ "clip_ratio/low_mean": 0.0,
520
+ "clip_ratio/low_min": 0.0,
521
+ "clip_ratio/region_mean": 0.0,
522
+ "completions/clipped_ratio": 0.15000000707805156,
523
+ "completions/max_length": 245.4,
524
+ "completions/max_terminated_length": 235.8,
525
+ "completions/mean_length": 145.20000457763672,
526
+ "completions/mean_terminated_length": 128.7294189453125,
527
+ "completions/min_length": 50.8,
528
+ "completions/min_terminated_length": 50.8,
529
+ "entropy": 1.2446547746658325,
530
+ "epoch": 0.19,
531
+ "frac_reward_zero_std": 0.0,
532
+ "grad_norm": 3.650216579437256,
533
+ "kl": 0.19710368812084197,
534
+ "learning_rate": 4.530000000000001e-06,
535
+ "loss": 0.1361,
536
+ "num_tokens": 1213177.0,
537
+ "reward": 1.3565361022949218,
538
+ "reward_std": 0.3620565414428711,
539
+ "rewards/politeness_reward_func/mean": 1.356536078453064,
540
+ "rewards/politeness_reward_func/std": 0.4186973810195923,
541
+ "step": 95,
542
+ "step_time": 10.10843729674816
543
+ },
544
+ {
545
+ "clip_ratio/high_max": 0.0,
546
+ "clip_ratio/high_mean": 0.0,
547
+ "clip_ratio/low_mean": 0.0,
548
+ "clip_ratio/low_min": 0.0,
549
+ "clip_ratio/region_mean": 0.0,
550
+ "completions/clipped_ratio": 0.15416666865348816,
551
+ "completions/max_length": 253.2,
552
+ "completions/max_terminated_length": 239.6,
553
+ "completions/mean_length": 127.69167022705078,
554
+ "completions/mean_terminated_length": 104.76427154541015,
555
+ "completions/min_length": 33.6,
556
+ "completions/min_terminated_length": 33.6,
557
+ "entropy": 1.21082746386528,
558
+ "epoch": 0.2,
559
+ "frac_reward_zero_std": 0.0,
560
+ "grad_norm": 4.778512477874756,
561
+ "kl": 0.23306063413619996,
562
+ "learning_rate": 4.505e-06,
563
+ "loss": 0.1579,
564
+ "num_tokens": 1261199.0,
565
+ "reward": 1.4624977350234984,
566
+ "reward_std": 0.27724905014038087,
567
+ "rewards/politeness_reward_func/mean": 1.4624976634979248,
568
+ "rewards/politeness_reward_func/std": 0.353417181968689,
569
+ "step": 100,
570
+ "step_time": 10.490663215517998
571
+ },
572
+ {
573
+ "clip_ratio/high_max": 0.0,
574
+ "clip_ratio/high_mean": 0.0,
575
+ "clip_ratio/low_mean": 0.0,
576
+ "clip_ratio/low_min": 0.0,
577
+ "clip_ratio/region_mean": 0.0,
578
+ "completions/clipped_ratio": 0.2916666746139526,
579
+ "completions/max_length": 256.0,
580
+ "completions/max_terminated_length": 239.8,
581
+ "completions/mean_length": 164.0000030517578,
582
+ "completions/mean_terminated_length": 131.73128814697264,
583
+ "completions/min_length": 52.0,
584
+ "completions/min_terminated_length": 52.0,
585
+ "entropy": 1.227693748474121,
586
+ "epoch": 0.21,
587
+ "frac_reward_zero_std": 0.0,
588
+ "grad_norm": 3.480128765106201,
589
+ "kl": 0.21182419136166572,
590
+ "learning_rate": 4.48e-06,
591
+ "loss": 0.1385,
592
+ "num_tokens": 1317887.0,
593
+ "reward": 1.4506232976913451,
594
+ "reward_std": 0.38838809728622437,
595
+ "rewards/politeness_reward_func/mean": 1.4506232500076295,
596
+ "rewards/politeness_reward_func/std": 0.415321010351181,
597
+ "step": 105,
598
+ "step_time": 10.568055501580238
599
+ },
600
+ {
601
+ "clip_ratio/high_max": 0.0,
602
+ "clip_ratio/high_mean": 0.0,
603
+ "clip_ratio/low_mean": 0.0,
604
+ "clip_ratio/low_min": 0.0,
605
+ "clip_ratio/region_mean": 0.0,
606
+ "completions/clipped_ratio": 0.2875000134110451,
607
+ "completions/max_length": 256.0,
608
+ "completions/max_terminated_length": 246.6,
609
+ "completions/mean_length": 164.9791717529297,
610
+ "completions/mean_terminated_length": 129.34453430175782,
611
+ "completions/min_length": 50.8,
612
+ "completions/min_terminated_length": 50.8,
613
+ "entropy": 1.2591872215270996,
614
+ "epoch": 0.22,
615
+ "frac_reward_zero_std": 0.0,
616
+ "grad_norm": 5.477797985076904,
617
+ "kl": 0.21972680687904358,
618
+ "learning_rate": 4.4550000000000005e-06,
619
+ "loss": 0.118,
620
+ "num_tokens": 1374570.0,
621
+ "reward": 1.3832386493682862,
622
+ "reward_std": 0.36756529808044436,
623
+ "rewards/politeness_reward_func/mean": 1.3832386016845704,
624
+ "rewards/politeness_reward_func/std": 0.45387519598007203,
625
+ "step": 110,
626
+ "step_time": 10.580036842823029
627
+ },
628
+ {
629
+ "clip_ratio/high_max": 0.0,
630
+ "clip_ratio/high_mean": 0.0,
631
+ "clip_ratio/low_mean": 0.0,
632
+ "clip_ratio/low_min": 0.0,
633
+ "clip_ratio/region_mean": 0.0,
634
+ "completions/clipped_ratio": 0.4041666805744171,
635
+ "completions/max_length": 256.0,
636
+ "completions/max_terminated_length": 239.6,
637
+ "completions/mean_length": 189.48334045410155,
638
+ "completions/mean_terminated_length": 144.77314147949218,
639
+ "completions/min_length": 55.6,
640
+ "completions/min_terminated_length": 55.6,
641
+ "entropy": 1.4673572778701782,
642
+ "epoch": 0.23,
643
+ "frac_reward_zero_std": 0.0,
644
+ "grad_norm": 4.5870041847229,
645
+ "kl": 0.18002651631832123,
646
+ "learning_rate": 4.430000000000001e-06,
647
+ "loss": 0.0902,
648
+ "num_tokens": 1437278.0,
649
+ "reward": 1.463021945953369,
650
+ "reward_std": 0.36559332013130186,
651
+ "rewards/politeness_reward_func/mean": 1.4630219221115113,
652
+ "rewards/politeness_reward_func/std": 0.4403323769569397,
653
+ "step": 115,
654
+ "step_time": 10.606720576435327
655
+ },
656
+ {
657
+ "clip_ratio/high_max": 0.0,
658
+ "clip_ratio/high_mean": 0.0,
659
+ "clip_ratio/low_mean": 0.0,
660
+ "clip_ratio/low_min": 0.0,
661
+ "clip_ratio/region_mean": 0.0,
662
+ "completions/clipped_ratio": 0.29166667759418485,
663
+ "completions/max_length": 256.0,
664
+ "completions/max_terminated_length": 250.4,
665
+ "completions/mean_length": 168.28750610351562,
666
+ "completions/mean_terminated_length": 134.1982864379883,
667
+ "completions/min_length": 45.0,
668
+ "completions/min_terminated_length": 45.0,
669
+ "entropy": 1.439770805835724,
670
+ "epoch": 0.24,
671
+ "frac_reward_zero_std": 0.0,
672
+ "grad_norm": 4.188452243804932,
673
+ "kl": 0.23004925400018691,
674
+ "learning_rate": 4.405e-06,
675
+ "loss": 0.1038,
676
+ "num_tokens": 1495123.0,
677
+ "reward": 1.6009358167648315,
678
+ "reward_std": 0.3070066928863525,
679
+ "rewards/politeness_reward_func/mean": 1.6009357929229737,
680
+ "rewards/politeness_reward_func/std": 0.34360362887382506,
681
+ "step": 120,
682
+ "step_time": 10.64953521117568
683
+ },
684
+ {
685
+ "clip_ratio/high_max": 0.0,
686
+ "clip_ratio/high_mean": 0.0,
687
+ "clip_ratio/low_mean": 0.0,
688
+ "clip_ratio/low_min": 0.0,
689
+ "clip_ratio/region_mean": 0.0,
690
+ "completions/clipped_ratio": 0.1833333395421505,
691
+ "completions/max_length": 256.0,
692
+ "completions/max_terminated_length": 243.2,
693
+ "completions/mean_length": 140.8666748046875,
694
+ "completions/mean_terminated_length": 116.34450073242188,
695
+ "completions/min_length": 37.8,
696
+ "completions/min_terminated_length": 37.8,
697
+ "entropy": 1.2707997798919677,
698
+ "epoch": 0.25,
699
+ "frac_reward_zero_std": 0.0,
700
+ "grad_norm": 4.1139140129089355,
701
+ "kl": 0.2768221229314804,
702
+ "learning_rate": 4.38e-06,
703
+ "loss": 0.146,
704
+ "num_tokens": 1546755.0,
705
+ "reward": 1.528564214706421,
706
+ "reward_std": 0.29817952811717985,
707
+ "rewards/politeness_reward_func/mean": 1.528564167022705,
708
+ "rewards/politeness_reward_func/std": 0.353902006149292,
709
+ "step": 125,
710
+ "step_time": 10.536190098524093
711
+ },
712
+ {
713
+ "clip_ratio/high_max": 0.0,
714
+ "clip_ratio/high_mean": 0.0,
715
+ "clip_ratio/low_mean": 0.0,
716
+ "clip_ratio/low_min": 0.0,
717
+ "clip_ratio/region_mean": 0.0,
718
+ "completions/clipped_ratio": 0.12500000298023223,
719
+ "completions/max_length": 256.0,
720
+ "completions/max_terminated_length": 228.6,
721
+ "completions/mean_length": 126.03333587646485,
722
+ "completions/mean_terminated_length": 108.7752197265625,
723
+ "completions/min_length": 37.6,
724
+ "completions/min_terminated_length": 37.6,
725
+ "entropy": 1.239863795042038,
726
+ "epoch": 0.26,
727
+ "frac_reward_zero_std": 0.0,
728
+ "grad_norm": 20.54998207092285,
729
+ "kl": 0.4258181616663933,
730
+ "learning_rate": 4.355000000000001e-06,
731
+ "loss": 0.205,
732
+ "num_tokens": 1594475.0,
733
+ "reward": 1.5162237882614136,
734
+ "reward_std": 0.37349834442138674,
735
+ "rewards/politeness_reward_func/mean": 1.5162237882614136,
736
+ "rewards/politeness_reward_func/std": 0.4025087893009186,
737
+ "step": 130,
738
+ "step_time": 10.561264203488827
739
+ },
740
+ {
741
+ "clip_ratio/high_max": 0.0,
742
+ "clip_ratio/high_mean": 0.0,
743
+ "clip_ratio/low_mean": 0.0,
744
+ "clip_ratio/low_min": 0.0,
745
+ "clip_ratio/region_mean": 0.0,
746
+ "completions/clipped_ratio": 0.1375000063329935,
747
+ "completions/max_length": 256.0,
748
+ "completions/max_terminated_length": 235.4,
749
+ "completions/mean_length": 121.34167175292968,
750
+ "completions/mean_terminated_length": 100.33185272216797,
751
+ "completions/min_length": 29.0,
752
+ "completions/min_terminated_length": 29.0,
753
+ "entropy": 1.44633287191391,
754
+ "epoch": 0.27,
755
+ "frac_reward_zero_std": 0.0,
756
+ "grad_norm": 4.6437859535217285,
757
+ "kl": 0.2876953512430191,
758
+ "learning_rate": 4.33e-06,
759
+ "loss": 0.1474,
760
+ "num_tokens": 1640909.0,
761
+ "reward": 1.6570862054824829,
762
+ "reward_std": 0.35202938318252563,
763
+ "rewards/politeness_reward_func/mean": 1.657086157798767,
764
+ "rewards/politeness_reward_func/std": 0.4086485207080841,
765
+ "step": 135,
766
+ "step_time": 10.50367113724351
767
+ },
768
+ {
769
+ "clip_ratio/high_max": 0.0,
770
+ "clip_ratio/high_mean": 0.0,
771
+ "clip_ratio/low_mean": 0.0,
772
+ "clip_ratio/low_min": 0.0,
773
+ "clip_ratio/region_mean": 0.0,
774
+ "completions/clipped_ratio": 0.25000000596046446,
775
+ "completions/max_length": 256.0,
776
+ "completions/max_terminated_length": 232.4,
777
+ "completions/mean_length": 165.46250610351564,
778
+ "completions/mean_terminated_length": 134.87125549316406,
779
+ "completions/min_length": 50.2,
780
+ "completions/min_terminated_length": 50.2,
781
+ "entropy": 1.5645981192588807,
782
+ "epoch": 0.28,
783
+ "frac_reward_zero_std": 0.0,
784
+ "grad_norm": 4.930655479431152,
785
+ "kl": 0.20734833329916,
786
+ "learning_rate": 4.305e-06,
787
+ "loss": 0.1052,
788
+ "num_tokens": 1698396.0,
789
+ "reward": 1.5796484470367431,
790
+ "reward_std": 0.39563443660736086,
791
+ "rewards/politeness_reward_func/mean": 1.5796483516693116,
792
+ "rewards/politeness_reward_func/std": 0.43083653450012205,
793
+ "step": 140,
794
+ "step_time": 10.580496142059564
795
+ },
796
+ {
797
+ "clip_ratio/high_max": 0.0,
798
+ "clip_ratio/high_mean": 0.0,
799
+ "clip_ratio/low_mean": 0.0,
800
+ "clip_ratio/low_min": 0.0,
801
+ "clip_ratio/region_mean": 0.0,
802
+ "completions/clipped_ratio": 0.3666666731238365,
803
+ "completions/max_length": 256.0,
804
+ "completions/max_terminated_length": 245.4,
805
+ "completions/mean_length": 182.1541717529297,
806
+ "completions/mean_terminated_length": 148.0070785522461,
807
+ "completions/min_length": 67.8,
808
+ "completions/min_terminated_length": 67.8,
809
+ "entropy": 1.4628460884094239,
810
+ "epoch": 0.29,
811
+ "frac_reward_zero_std": 0.0,
812
+ "grad_norm": 3.1963322162628174,
813
+ "kl": 0.20285871326923371,
814
+ "learning_rate": 4.2800000000000005e-06,
815
+ "loss": 0.0054,
816
+ "num_tokens": 1759217.0,
817
+ "reward": 1.5254708528518677,
818
+ "reward_std": 0.3578204929828644,
819
+ "rewards/politeness_reward_func/mean": 1.5254708051681518,
820
+ "rewards/politeness_reward_func/std": 0.39698067903518675,
821
+ "step": 145,
822
+ "step_time": 10.641796179115772
823
+ },
824
+ {
825
+ "clip_ratio/high_max": 0.0,
826
+ "clip_ratio/high_mean": 0.0,
827
+ "clip_ratio/low_mean": 0.0,
828
+ "clip_ratio/low_min": 0.0,
829
+ "clip_ratio/region_mean": 0.0,
830
+ "completions/clipped_ratio": 0.325000011920929,
831
+ "completions/max_length": 256.0,
832
+ "completions/max_terminated_length": 243.8,
833
+ "completions/mean_length": 185.4041717529297,
834
+ "completions/mean_terminated_length": 151.72132263183593,
835
+ "completions/min_length": 54.2,
836
+ "completions/min_terminated_length": 54.2,
837
+ "entropy": 1.550300669670105,
838
+ "epoch": 0.3,
839
+ "frac_reward_zero_std": 0.0,
840
+ "grad_norm": 3.5268337726593018,
841
+ "kl": 0.26886349618434907,
842
+ "learning_rate": 4.255e-06,
843
+ "loss": 0.0341,
844
+ "num_tokens": 1821250.0,
845
+ "reward": 1.5432605028152466,
846
+ "reward_std": 0.35273998975753784,
847
+ "rewards/politeness_reward_func/mean": 1.5432604551315308,
848
+ "rewards/politeness_reward_func/std": 0.39439594745635986,
849
+ "step": 150,
850
+ "step_time": 10.68586125895381
851
+ },
852
+ {
853
+ "clip_ratio/high_max": 0.0,
854
+ "clip_ratio/high_mean": 0.0,
855
+ "clip_ratio/low_mean": 0.0,
856
+ "clip_ratio/low_min": 0.0,
857
+ "clip_ratio/region_mean": 0.0,
858
+ "completions/clipped_ratio": 0.20416667610406875,
859
+ "completions/max_length": 256.0,
860
+ "completions/max_terminated_length": 241.8,
861
+ "completions/mean_length": 162.42083740234375,
862
+ "completions/mean_terminated_length": 139.60369567871095,
863
+ "completions/min_length": 58.0,
864
+ "completions/min_terminated_length": 58.0,
865
+ "entropy": 1.665429162979126,
866
+ "epoch": 0.31,
867
+ "frac_reward_zero_std": 0.0,
868
+ "grad_norm": 3.6844170093536377,
869
+ "kl": 0.22049252390861512,
870
+ "learning_rate": 4.23e-06,
871
+ "loss": 0.0256,
872
+ "num_tokens": 1877383.0,
873
+ "reward": 1.6982928276062013,
874
+ "reward_std": 0.30497619807720183,
875
+ "rewards/politeness_reward_func/mean": 1.6982927560806274,
876
+ "rewards/politeness_reward_func/std": 0.3323679566383362,
877
+ "step": 155,
878
+ "step_time": 10.646523833274841
879
+ },
880
+ {
881
+ "clip_ratio/high_max": 0.0,
882
+ "clip_ratio/high_mean": 0.0,
883
+ "clip_ratio/low_mean": 0.0,
884
+ "clip_ratio/low_min": 0.0,
885
+ "clip_ratio/region_mean": 0.0,
886
+ "completions/clipped_ratio": 0.12083333805203438,
887
+ "completions/max_length": 256.0,
888
+ "completions/max_terminated_length": 238.4,
889
+ "completions/mean_length": 141.9625045776367,
890
+ "completions/mean_terminated_length": 126.35040893554688,
891
+ "completions/min_length": 53.0,
892
+ "completions/min_terminated_length": 53.0,
893
+ "entropy": 1.493414855003357,
894
+ "epoch": 0.32,
895
+ "frac_reward_zero_std": 0.0,
896
+ "grad_norm": 5.583273410797119,
897
+ "kl": 0.25222020596265793,
898
+ "learning_rate": 4.205e-06,
899
+ "loss": 0.0511,
900
+ "num_tokens": 1928318.0,
901
+ "reward": 1.6409826755523682,
902
+ "reward_std": 0.2959387719631195,
903
+ "rewards/politeness_reward_func/mean": 1.6409826040267945,
904
+ "rewards/politeness_reward_func/std": 0.3353587478399277,
905
+ "step": 160,
906
+ "step_time": 10.597700411826372
907
+ },
908
+ {
909
+ "clip_ratio/high_max": 0.0,
910
+ "clip_ratio/high_mean": 0.0,
911
+ "clip_ratio/low_mean": 0.0,
912
+ "clip_ratio/low_min": 0.0,
913
+ "clip_ratio/region_mean": 0.0,
914
+ "completions/clipped_ratio": 0.22916667088866233,
915
+ "completions/max_length": 256.0,
916
+ "completions/max_terminated_length": 248.2,
917
+ "completions/mean_length": 159.16250762939453,
918
+ "completions/mean_terminated_length": 130.86475067138673,
919
+ "completions/min_length": 52.4,
920
+ "completions/min_terminated_length": 52.4,
921
+ "entropy": 1.5048229694366455,
922
+ "epoch": 0.33,
923
+ "frac_reward_zero_std": 0.0,
924
+ "grad_norm": 8.112273216247559,
925
+ "kl": 0.26160909086465833,
926
+ "learning_rate": 4.18e-06,
927
+ "loss": 0.1583,
928
+ "num_tokens": 1983861.0,
929
+ "reward": 1.5394394874572754,
930
+ "reward_std": 0.4292898178100586,
931
+ "rewards/politeness_reward_func/mean": 1.5394394636154174,
932
+ "rewards/politeness_reward_func/std": 0.4557326793670654,
933
+ "step": 165,
934
+ "step_time": 10.650840406119823
935
+ },
936
+ {
937
+ "clip_ratio/high_max": 0.0,
938
+ "clip_ratio/high_mean": 0.0,
939
+ "clip_ratio/low_mean": 0.0,
940
+ "clip_ratio/low_min": 0.0,
941
+ "clip_ratio/region_mean": 0.0,
942
+ "completions/clipped_ratio": 0.3583333373069763,
943
+ "completions/max_length": 256.0,
944
+ "completions/max_terminated_length": 249.8,
945
+ "completions/mean_length": 181.42084045410155,
946
+ "completions/mean_terminated_length": 142.14824829101562,
947
+ "completions/min_length": 47.6,
948
+ "completions/min_terminated_length": 47.6,
949
+ "entropy": 1.6774910688400269,
950
+ "epoch": 0.34,
951
+ "frac_reward_zero_std": 0.0,
952
+ "grad_norm": 5.154572486877441,
953
+ "kl": 0.21350446194410325,
954
+ "learning_rate": 4.155e-06,
955
+ "loss": 0.0845,
956
+ "num_tokens": 2044922.0,
957
+ "reward": 1.617796039581299,
958
+ "reward_std": 0.42567009329795835,
959
+ "rewards/politeness_reward_func/mean": 1.617796039581299,
960
+ "rewards/politeness_reward_func/std": 0.5039380311965942,
961
+ "step": 170,
962
+ "step_time": 10.696013970673084
963
+ },
964
+ {
965
+ "clip_ratio/high_max": 0.0,
966
+ "clip_ratio/high_mean": 0.0,
967
+ "clip_ratio/low_mean": 0.0,
968
+ "clip_ratio/low_min": 0.0,
969
+ "clip_ratio/region_mean": 0.0,
970
+ "completions/clipped_ratio": 0.21666667759418487,
971
+ "completions/max_length": 256.0,
972
+ "completions/max_terminated_length": 235.8,
973
+ "completions/mean_length": 150.75000610351563,
974
+ "completions/mean_terminated_length": 122.61416320800781,
975
+ "completions/min_length": 41.4,
976
+ "completions/min_terminated_length": 41.4,
977
+ "entropy": 1.7118729948997498,
978
+ "epoch": 0.35,
979
+ "frac_reward_zero_std": 0.0,
980
+ "grad_norm": 4.096772193908691,
981
+ "kl": 0.24443377554416656,
982
+ "learning_rate": 4.13e-06,
983
+ "loss": 0.062,
984
+ "num_tokens": 2098654.0,
985
+ "reward": 1.7112194776535035,
986
+ "reward_std": 0.41009202003479006,
987
+ "rewards/politeness_reward_func/mean": 1.7112194299697876,
988
+ "rewards/politeness_reward_func/std": 0.46850563287734986,
989
+ "step": 175,
990
+ "step_time": 10.681096732616425
991
+ },
992
+ {
993
+ "clip_ratio/high_max": 0.0,
994
+ "clip_ratio/high_mean": 0.0,
995
+ "clip_ratio/low_mean": 0.0,
996
+ "clip_ratio/low_min": 0.0,
997
+ "clip_ratio/region_mean": 0.0,
998
+ "completions/clipped_ratio": 0.24583333544433117,
999
+ "completions/max_length": 256.0,
1000
+ "completions/max_terminated_length": 250.4,
1001
+ "completions/mean_length": 161.39583892822264,
1002
+ "completions/mean_terminated_length": 134.3897247314453,
1003
+ "completions/min_length": 56.0,
1004
+ "completions/min_terminated_length": 56.0,
1005
+ "entropy": 1.6281691670417786,
1006
+ "epoch": 0.36,
1007
+ "frac_reward_zero_std": 0.0,
1008
+ "grad_norm": 39.266761779785156,
1009
+ "kl": 0.3168014049530029,
1010
+ "learning_rate": 4.1050000000000005e-06,
1011
+ "loss": 0.0628,
1012
+ "num_tokens": 2154557.0,
1013
+ "reward": 1.592983651161194,
1014
+ "reward_std": 0.4036149501800537,
1015
+ "rewards/politeness_reward_func/mean": 1.5929836750030517,
1016
+ "rewards/politeness_reward_func/std": 0.4594584465026855,
1017
+ "step": 180,
1018
+ "step_time": 10.576303614675998
1019
+ },
1020
+ {
1021
+ "clip_ratio/high_max": 0.0,
1022
+ "clip_ratio/high_mean": 0.0,
1023
+ "clip_ratio/low_mean": 0.0,
1024
+ "clip_ratio/low_min": 0.0,
1025
+ "clip_ratio/region_mean": 0.0,
1026
+ "completions/clipped_ratio": 0.2041666716337204,
1027
+ "completions/max_length": 256.0,
1028
+ "completions/max_terminated_length": 248.0,
1029
+ "completions/mean_length": 154.63750305175782,
1030
+ "completions/mean_terminated_length": 128.9908935546875,
1031
+ "completions/min_length": 47.2,
1032
+ "completions/min_terminated_length": 47.2,
1033
+ "entropy": 1.5905902862548829,
1034
+ "epoch": 0.37,
1035
+ "frac_reward_zero_std": 0.0,
1036
+ "grad_norm": 3.8977794647216797,
1037
+ "kl": 0.2617978900671005,
1038
+ "learning_rate": 4.08e-06,
1039
+ "loss": 0.0605,
1040
+ "num_tokens": 2208870.0,
1041
+ "reward": 1.608209228515625,
1042
+ "reward_std": 0.40417273342609406,
1043
+ "rewards/politeness_reward_func/mean": 1.608209204673767,
1044
+ "rewards/politeness_reward_func/std": 0.44712826013565066,
1045
+ "step": 185,
1046
+ "step_time": 10.657971718162297
1047
+ },
1048
+ {
1049
+ "clip_ratio/high_max": 0.0,
1050
+ "clip_ratio/high_mean": 0.0,
1051
+ "clip_ratio/low_mean": 0.0,
1052
+ "clip_ratio/low_min": 0.0,
1053
+ "clip_ratio/region_mean": 0.0,
1054
+ "completions/clipped_ratio": 0.25416666865348814,
1055
+ "completions/max_length": 256.0,
1056
+ "completions/max_terminated_length": 243.6,
1057
+ "completions/mean_length": 167.1416748046875,
1058
+ "completions/mean_terminated_length": 136.75882873535156,
1059
+ "completions/min_length": 46.6,
1060
+ "completions/min_terminated_length": 46.6,
1061
+ "entropy": 1.76783127784729,
1062
+ "epoch": 0.38,
1063
+ "frac_reward_zero_std": 0.0,
1064
+ "grad_norm": 5.072940826416016,
1065
+ "kl": 0.29045032411813737,
1066
+ "learning_rate": 4.055000000000001e-06,
1067
+ "loss": 0.0313,
1068
+ "num_tokens": 2266296.0,
1069
+ "reward": 1.7399524927139283,
1070
+ "reward_std": 0.4326803207397461,
1071
+ "rewards/politeness_reward_func/mean": 1.7399524688720702,
1072
+ "rewards/politeness_reward_func/std": 0.44698241353034973,
1073
+ "step": 190,
1074
+ "step_time": 10.595029101520776
1075
+ },
1076
+ {
1077
+ "clip_ratio/high_max": 0.0,
1078
+ "clip_ratio/high_mean": 0.0,
1079
+ "clip_ratio/low_mean": 0.0,
1080
+ "clip_ratio/low_min": 0.0,
1081
+ "clip_ratio/region_mean": 0.0,
1082
+ "completions/clipped_ratio": 0.387500011920929,
1083
+ "completions/max_length": 256.0,
1084
+ "completions/max_terminated_length": 249.8,
1085
+ "completions/mean_length": 190.1125030517578,
1086
+ "completions/mean_terminated_length": 150.54798278808593,
1087
+ "completions/min_length": 57.8,
1088
+ "completions/min_terminated_length": 57.8,
1089
+ "entropy": 1.7596600532531739,
1090
+ "epoch": 0.39,
1091
+ "frac_reward_zero_std": 0.0,
1092
+ "grad_norm": 4.128026008605957,
1093
+ "kl": 0.25340984016656876,
1094
+ "learning_rate": 4.03e-06,
1095
+ "loss": 0.0384,
1096
+ "num_tokens": 2329619.0,
1097
+ "reward": 1.6809997797012328,
1098
+ "reward_std": 0.4062549531459808,
1099
+ "rewards/politeness_reward_func/mean": 1.6809997320175172,
1100
+ "rewards/politeness_reward_func/std": 0.44869658946990965,
1101
+ "step": 195,
1102
+ "step_time": 10.63182978257537
1103
+ },
1104
+ {
1105
+ "clip_ratio/high_max": 0.0,
1106
+ "clip_ratio/high_mean": 0.0,
1107
+ "clip_ratio/low_mean": 0.0,
1108
+ "clip_ratio/low_min": 0.0,
1109
+ "clip_ratio/region_mean": 0.0,
1110
+ "completions/clipped_ratio": 0.320833346247673,
1111
+ "completions/max_length": 256.0,
1112
+ "completions/max_terminated_length": 250.8,
1113
+ "completions/mean_length": 182.68334045410157,
1114
+ "completions/mean_terminated_length": 149.52076721191406,
1115
+ "completions/min_length": 60.2,
1116
+ "completions/min_terminated_length": 60.2,
1117
+ "entropy": 1.7882241249084472,
1118
+ "epoch": 0.4,
1119
+ "frac_reward_zero_std": 0.0,
1120
+ "grad_norm": 4.352699279785156,
1121
+ "kl": 0.2817923933267593,
1122
+ "learning_rate": 4.005000000000001e-06,
1123
+ "loss": -0.0338,
1124
+ "num_tokens": 2390743.0,
1125
+ "reward": 1.7947330713272094,
1126
+ "reward_std": 0.3602103054523468,
1127
+ "rewards/politeness_reward_func/mean": 1.7947329998016357,
1128
+ "rewards/politeness_reward_func/std": 0.37249717116355896,
1129
+ "step": 200,
1130
+ "step_time": 10.771137349307537
1131
+ },
1132
+ {
1133
+ "clip_ratio/high_max": 0.0,
1134
+ "clip_ratio/high_mean": 0.0,
1135
+ "clip_ratio/low_mean": 0.0,
1136
+ "clip_ratio/low_min": 0.0,
1137
+ "clip_ratio/region_mean": 0.0,
1138
+ "completions/clipped_ratio": 0.279166679084301,
1139
+ "completions/max_length": 256.0,
1140
+ "completions/max_terminated_length": 251.0,
1141
+ "completions/mean_length": 175.2791717529297,
1142
+ "completions/mean_terminated_length": 146.66289672851562,
1143
+ "completions/min_length": 54.2,
1144
+ "completions/min_terminated_length": 54.2,
1145
+ "entropy": 1.805497646331787,
1146
+ "epoch": 0.41,
1147
+ "frac_reward_zero_std": 0.0,
1148
+ "grad_norm": 6.236753463745117,
1149
+ "kl": 0.4257870987057686,
1150
+ "learning_rate": 3.980000000000001e-06,
1151
+ "loss": 0.0117,
1152
+ "num_tokens": 2449866.0,
1153
+ "reward": 1.7829206705093383,
1154
+ "reward_std": 0.4116291403770447,
1155
+ "rewards/politeness_reward_func/mean": 1.7829206705093383,
1156
+ "rewards/politeness_reward_func/std": 0.43859742283821107,
1157
+ "step": 205,
1158
+ "step_time": 10.78743917644024
1159
+ },
1160
+ {
1161
+ "clip_ratio/high_max": 0.0,
1162
+ "clip_ratio/high_mean": 0.0,
1163
+ "clip_ratio/low_mean": 0.0,
1164
+ "clip_ratio/low_min": 0.0,
1165
+ "clip_ratio/region_mean": 0.0,
1166
+ "completions/clipped_ratio": 0.254166679084301,
1167
+ "completions/max_length": 256.0,
1168
+ "completions/max_terminated_length": 248.2,
1169
+ "completions/mean_length": 171.39584045410157,
1170
+ "completions/mean_terminated_length": 143.78868103027344,
1171
+ "completions/min_length": 52.0,
1172
+ "completions/min_terminated_length": 52.0,
1173
+ "entropy": 1.6819992780685424,
1174
+ "epoch": 0.42,
1175
+ "frac_reward_zero_std": 0.0,
1176
+ "grad_norm": 4.016842842102051,
1177
+ "kl": 0.2606677010655403,
1178
+ "learning_rate": 3.955e-06,
1179
+ "loss": 0.0507,
1180
+ "num_tokens": 2507913.0,
1181
+ "reward": 1.6511780500411988,
1182
+ "reward_std": 0.4131308376789093,
1183
+ "rewards/politeness_reward_func/mean": 1.651178002357483,
1184
+ "rewards/politeness_reward_func/std": 0.43051646947860717,
1185
+ "step": 210,
1186
+ "step_time": 10.86044239550829
1187
+ },
1188
+ {
1189
+ "clip_ratio/high_max": 0.0,
1190
+ "clip_ratio/high_mean": 0.0,
1191
+ "clip_ratio/low_mean": 0.0,
1192
+ "clip_ratio/low_min": 0.0,
1193
+ "clip_ratio/region_mean": 0.0,
1194
+ "completions/clipped_ratio": 0.2250000059604645,
1195
+ "completions/max_length": 256.0,
1196
+ "completions/max_terminated_length": 249.2,
1197
+ "completions/mean_length": 166.20833740234374,
1198
+ "completions/mean_terminated_length": 139.74808349609376,
1199
+ "completions/min_length": 58.8,
1200
+ "completions/min_terminated_length": 58.8,
1201
+ "entropy": 1.5043580889701844,
1202
+ "epoch": 0.43,
1203
+ "frac_reward_zero_std": 0.0,
1204
+ "grad_norm": 3.5619537830352783,
1205
+ "kl": 0.24042100906372071,
1206
+ "learning_rate": 3.9300000000000005e-06,
1207
+ "loss": -0.0266,
1208
+ "num_tokens": 2565323.0,
1209
+ "reward": 1.6339158296585083,
1210
+ "reward_std": 0.33398547768592834,
1211
+ "rewards/politeness_reward_func/mean": 1.6339158535003662,
1212
+ "rewards/politeness_reward_func/std": 0.37013601660728457,
1213
+ "step": 215,
1214
+ "step_time": 10.860479059070348
1215
+ },
1216
+ {
1217
+ "clip_ratio/high_max": 0.0,
1218
+ "clip_ratio/high_mean": 0.0,
1219
+ "clip_ratio/low_mean": 0.0,
1220
+ "clip_ratio/low_min": 0.0,
1221
+ "clip_ratio/region_mean": 0.0,
1222
+ "completions/clipped_ratio": 0.2291666731238365,
1223
+ "completions/max_length": 256.0,
1224
+ "completions/max_terminated_length": 238.6,
1225
+ "completions/mean_length": 162.5666717529297,
1226
+ "completions/mean_terminated_length": 137.25745391845703,
1227
+ "completions/min_length": 52.4,
1228
+ "completions/min_terminated_length": 52.4,
1229
+ "entropy": 1.707606041431427,
1230
+ "epoch": 0.44,
1231
+ "frac_reward_zero_std": 0.0,
1232
+ "grad_norm": 4.356007099151611,
1233
+ "kl": 0.275185127556324,
1234
+ "learning_rate": 3.905000000000001e-06,
1235
+ "loss": 0.0385,
1236
+ "num_tokens": 2621347.0,
1237
+ "reward": 1.7018348932266236,
1238
+ "reward_std": 0.3872075915336609,
1239
+ "rewards/politeness_reward_func/mean": 1.7018348217010497,
1240
+ "rewards/politeness_reward_func/std": 0.43171402215957644,
1241
+ "step": 220,
1242
+ "step_time": 10.862962251901626
1243
+ },
1244
+ {
1245
+ "clip_ratio/high_max": 0.0,
1246
+ "clip_ratio/high_mean": 0.0,
1247
+ "clip_ratio/low_mean": 0.0,
1248
+ "clip_ratio/low_min": 0.0,
1249
+ "clip_ratio/region_mean": 0.0,
1250
+ "completions/clipped_ratio": 0.25000000447034837,
1251
+ "completions/max_length": 256.0,
1252
+ "completions/max_terminated_length": 247.6,
1253
+ "completions/mean_length": 170.00000610351563,
1254
+ "completions/mean_terminated_length": 142.47680053710937,
1255
+ "completions/min_length": 51.4,
1256
+ "completions/min_terminated_length": 51.4,
1257
+ "entropy": 1.7443210363388062,
1258
+ "epoch": 0.45,
1259
+ "frac_reward_zero_std": 0.0,
1260
+ "grad_norm": 4.761515140533447,
1261
+ "kl": 0.2994952619075775,
1262
+ "learning_rate": 3.88e-06,
1263
+ "loss": 0.0254,
1264
+ "num_tokens": 2679443.0,
1265
+ "reward": 1.6937603950500488,
1266
+ "reward_std": 0.42128287851810453,
1267
+ "rewards/politeness_reward_func/mean": 1.6937604188919066,
1268
+ "rewards/politeness_reward_func/std": 0.4645624727010727,
1269
+ "step": 225,
1270
+ "step_time": 10.798750822246074
1271
+ },
1272
+ {
1273
+ "clip_ratio/high_max": 0.0,
1274
+ "clip_ratio/high_mean": 0.0,
1275
+ "clip_ratio/low_mean": 0.0,
1276
+ "clip_ratio/low_min": 0.0,
1277
+ "clip_ratio/region_mean": 0.0,
1278
+ "completions/clipped_ratio": 0.2750000089406967,
1279
+ "completions/max_length": 256.0,
1280
+ "completions/max_terminated_length": 238.4,
1281
+ "completions/mean_length": 171.07500305175782,
1282
+ "completions/mean_terminated_length": 139.3609603881836,
1283
+ "completions/min_length": 51.4,
1284
+ "completions/min_terminated_length": 51.4,
1285
+ "entropy": 1.7655636191368103,
1286
+ "epoch": 0.46,
1287
+ "frac_reward_zero_std": 0.0,
1288
+ "grad_norm": 4.219749450683594,
1289
+ "kl": 0.30960221886634826,
1290
+ "learning_rate": 3.855e-06,
1291
+ "loss": 0.0374,
1292
+ "num_tokens": 2738565.0,
1293
+ "reward": 1.74673752784729,
1294
+ "reward_std": 0.49648854732513426,
1295
+ "rewards/politeness_reward_func/mean": 1.7467374801635742,
1296
+ "rewards/politeness_reward_func/std": 0.5134409368038177,
1297
+ "step": 230,
1298
+ "step_time": 10.840561749786138
1299
+ },
1300
+ {
1301
+ "clip_ratio/high_max": 0.0,
1302
+ "clip_ratio/high_mean": 0.0,
1303
+ "clip_ratio/low_mean": 0.0,
1304
+ "clip_ratio/low_min": 0.0,
1305
+ "clip_ratio/region_mean": 0.0,
1306
+ "completions/clipped_ratio": 0.24166667759418486,
1307
+ "completions/max_length": 256.0,
1308
+ "completions/max_terminated_length": 249.4,
1309
+ "completions/mean_length": 170.58750610351564,
1310
+ "completions/mean_terminated_length": 144.16094665527345,
1311
+ "completions/min_length": 49.6,
1312
+ "completions/min_terminated_length": 49.6,
1313
+ "entropy": 1.5835796117782592,
1314
+ "epoch": 0.47,
1315
+ "frac_reward_zero_std": 0.0,
1316
+ "grad_norm": 8.280583381652832,
1317
+ "kl": 0.28185769021511076,
1318
+ "learning_rate": 3.830000000000001e-06,
1319
+ "loss": 0.0433,
1320
+ "num_tokens": 2797282.0,
1321
+ "reward": 1.6517542362213136,
1322
+ "reward_std": 0.4238882720470428,
1323
+ "rewards/politeness_reward_func/mean": 1.6517542362213136,
1324
+ "rewards/politeness_reward_func/std": 0.46576651334762575,
1325
+ "step": 235,
1326
+ "step_time": 10.934168311953545
1327
+ },
1328
+ {
1329
+ "clip_ratio/high_max": 0.0,
1330
+ "clip_ratio/high_mean": 0.0,
1331
+ "clip_ratio/low_mean": 0.0,
1332
+ "clip_ratio/low_min": 0.0,
1333
+ "clip_ratio/region_mean": 0.0,
1334
+ "completions/clipped_ratio": 0.1833333358168602,
1335
+ "completions/max_length": 256.0,
1336
+ "completions/max_terminated_length": 248.4,
1337
+ "completions/mean_length": 149.59166870117187,
1338
+ "completions/mean_terminated_length": 125.1296600341797,
1339
+ "completions/min_length": 40.4,
1340
+ "completions/min_terminated_length": 40.4,
1341
+ "entropy": 1.5539511442184448,
1342
+ "epoch": 0.48,
1343
+ "frac_reward_zero_std": 0.0,
1344
+ "grad_norm": 3.9952290058135986,
1345
+ "kl": 0.2962296575307846,
1346
+ "learning_rate": 3.8050000000000004e-06,
1347
+ "loss": 0.0772,
1348
+ "num_tokens": 2850656.0,
1349
+ "reward": 1.5875454187393188,
1350
+ "reward_std": 0.33082354068756104,
1351
+ "rewards/politeness_reward_func/mean": 1.5875454187393188,
1352
+ "rewards/politeness_reward_func/std": 0.4173043370246887,
1353
+ "step": 240,
1354
+ "step_time": 10.826450176537037
1355
+ },
1356
+ {
1357
+ "clip_ratio/high_max": 0.0,
1358
+ "clip_ratio/high_mean": 0.0,
1359
+ "clip_ratio/low_mean": 0.0,
1360
+ "clip_ratio/low_min": 0.0,
1361
+ "clip_ratio/region_mean": 0.0,
1362
+ "completions/clipped_ratio": 0.42500001192092896,
1363
+ "completions/max_length": 256.0,
1364
+ "completions/max_terminated_length": 252.4,
1365
+ "completions/mean_length": 203.55417175292968,
1366
+ "completions/mean_terminated_length": 167.60662841796875,
1367
+ "completions/min_length": 68.0,
1368
+ "completions/min_terminated_length": 68.0,
1369
+ "entropy": 1.732195258140564,
1370
+ "epoch": 0.49,
1371
+ "frac_reward_zero_std": 0.0,
1372
+ "grad_norm": 4.070302963256836,
1373
+ "kl": 0.2232723578810692,
1374
+ "learning_rate": 3.7800000000000002e-06,
1375
+ "loss": 0.0646,
1376
+ "num_tokens": 2916917.0,
1377
+ "reward": 1.5489000082015991,
1378
+ "reward_std": 0.44896683692932127,
1379
+ "rewards/politeness_reward_func/mean": 1.5488999366760254,
1380
+ "rewards/politeness_reward_func/std": 0.4965748846530914,
1381
+ "step": 245,
1382
+ "step_time": 10.93318238928914
1383
+ },
1384
+ {
1385
+ "clip_ratio/high_max": 0.0,
1386
+ "clip_ratio/high_mean": 0.0,
1387
+ "clip_ratio/low_mean": 0.0,
1388
+ "clip_ratio/low_min": 0.0,
1389
+ "clip_ratio/region_mean": 0.0,
1390
+ "completions/clipped_ratio": 0.3375000059604645,
1391
+ "completions/max_length": 256.0,
1392
+ "completions/max_terminated_length": 250.6,
1393
+ "completions/mean_length": 175.0666717529297,
1394
+ "completions/mean_terminated_length": 135.0446517944336,
1395
+ "completions/min_length": 47.4,
1396
+ "completions/min_terminated_length": 47.4,
1397
+ "entropy": 1.7236797571182252,
1398
+ "epoch": 0.5,
1399
+ "frac_reward_zero_std": 0.0,
1400
+ "grad_norm": 15.248210906982422,
1401
+ "kl": 0.31031118631362914,
1402
+ "learning_rate": 3.7550000000000005e-06,
1403
+ "loss": 0.0402,
1404
+ "num_tokens": 2976245.0,
1405
+ "reward": 1.6219825506210328,
1406
+ "reward_std": 0.4284421443939209,
1407
+ "rewards/politeness_reward_func/mean": 1.6219825744628906,
1408
+ "rewards/politeness_reward_func/std": 0.4772031903266907,
1409
+ "step": 250,
1410
+ "step_time": 10.9129977889359
1411
+ },
1412
+ {
1413
+ "clip_ratio/high_max": 0.0,
1414
+ "clip_ratio/high_mean": 0.0,
1415
+ "clip_ratio/low_mean": 0.0,
1416
+ "clip_ratio/low_min": 0.0,
1417
+ "clip_ratio/region_mean": 0.0,
1418
+ "completions/clipped_ratio": 0.420833358168602,
1419
+ "completions/max_length": 256.0,
1420
+ "completions/max_terminated_length": 251.8,
1421
+ "completions/mean_length": 198.55000915527344,
1422
+ "completions/mean_terminated_length": 162.55760955810547,
1423
+ "completions/min_length": 71.8,
1424
+ "completions/min_terminated_length": 71.8,
1425
+ "entropy": 1.9296841025352478,
1426
+ "epoch": 0.51,
1427
+ "frac_reward_zero_std": 0.0,
1428
+ "grad_norm": 3.592284679412842,
1429
+ "kl": 0.25315294712781905,
1430
+ "learning_rate": 3.7300000000000003e-06,
1431
+ "loss": 0.0132,
1432
+ "num_tokens": 3041337.0,
1433
+ "reward": 1.6332777500152589,
1434
+ "reward_std": 0.4870866537094116,
1435
+ "rewards/politeness_reward_func/mean": 1.6332777261734008,
1436
+ "rewards/politeness_reward_func/std": 0.5504487872123718,
1437
+ "step": 255,
1438
+ "step_time": 10.853592294454575
1439
+ },
1440
+ {
1441
+ "clip_ratio/high_max": 0.0,
1442
+ "clip_ratio/high_mean": 0.0,
1443
+ "clip_ratio/low_mean": 0.0,
1444
+ "clip_ratio/low_min": 0.0,
1445
+ "clip_ratio/region_mean": 0.0,
1446
+ "completions/clipped_ratio": 0.32083334028720856,
1447
+ "completions/max_length": 256.0,
1448
+ "completions/max_terminated_length": 249.0,
1449
+ "completions/mean_length": 184.1916717529297,
1450
+ "completions/mean_terminated_length": 150.5311309814453,
1451
+ "completions/min_length": 62.8,
1452
+ "completions/min_terminated_length": 62.8,
1453
+ "entropy": 1.8794168829917908,
1454
+ "epoch": 0.52,
1455
+ "frac_reward_zero_std": 0.0,
1456
+ "grad_norm": 4.218177795410156,
1457
+ "kl": 0.24962413907051087,
1458
+ "learning_rate": 3.705e-06,
1459
+ "loss": 0.058,
1460
+ "num_tokens": 3102695.0,
1461
+ "reward": 1.7052141666412353,
1462
+ "reward_std": 0.4909538745880127,
1463
+ "rewards/politeness_reward_func/mean": 1.7052141666412353,
1464
+ "rewards/politeness_reward_func/std": 0.5124517560005188,
1465
+ "step": 260,
1466
+ "step_time": 10.939885137230158
1467
+ },
1468
+ {
1469
+ "clip_ratio/high_max": 0.0,
1470
+ "clip_ratio/high_mean": 0.0,
1471
+ "clip_ratio/low_mean": 0.0,
1472
+ "clip_ratio/low_min": 0.0,
1473
+ "clip_ratio/region_mean": 0.0,
1474
+ "completions/clipped_ratio": 0.39166667461395266,
1475
+ "completions/max_length": 256.0,
1476
+ "completions/max_terminated_length": 240.8,
1477
+ "completions/mean_length": 182.05833740234374,
1478
+ "completions/mean_terminated_length": 135.85120544433593,
1479
+ "completions/min_length": 47.8,
1480
+ "completions/min_terminated_length": 47.8,
1481
+ "entropy": 1.78580139875412,
1482
+ "epoch": 0.53,
1483
+ "frac_reward_zero_std": 0.0,
1484
+ "grad_norm": 3.726942777633667,
1485
+ "kl": 0.26940242052078245,
1486
+ "learning_rate": 3.6800000000000003e-06,
1487
+ "loss": 0.0129,
1488
+ "num_tokens": 3163621.0,
1489
+ "reward": 1.6984041452407836,
1490
+ "reward_std": 0.4623105704784393,
1491
+ "rewards/politeness_reward_func/mean": 1.69840407371521,
1492
+ "rewards/politeness_reward_func/std": 0.5121756374835968,
1493
+ "step": 265,
1494
+ "step_time": 10.816243136674165
1495
+ },
1496
+ {
1497
+ "clip_ratio/high_max": 0.0,
1498
+ "clip_ratio/high_mean": 0.0,
1499
+ "clip_ratio/low_mean": 0.0,
1500
+ "clip_ratio/low_min": 0.0,
1501
+ "clip_ratio/region_mean": 0.0,
1502
+ "completions/clipped_ratio": 0.37500000596046446,
1503
+ "completions/max_length": 256.0,
1504
+ "completions/max_terminated_length": 242.6,
1505
+ "completions/mean_length": 196.69584045410156,
1506
+ "completions/mean_terminated_length": 160.64813537597655,
1507
+ "completions/min_length": 66.4,
1508
+ "completions/min_terminated_length": 66.4,
1509
+ "entropy": 1.7563302516937256,
1510
+ "epoch": 0.54,
1511
+ "frac_reward_zero_std": 0.0,
1512
+ "grad_norm": 4.206515312194824,
1513
+ "kl": 0.3476646825671196,
1514
+ "learning_rate": 3.655e-06,
1515
+ "loss": 0.0463,
1516
+ "num_tokens": 3227836.0,
1517
+ "reward": 1.7140352249145507,
1518
+ "reward_std": 0.41706904768943787,
1519
+ "rewards/politeness_reward_func/mean": 1.714035153388977,
1520
+ "rewards/politeness_reward_func/std": 0.4722850799560547,
1521
+ "step": 270,
1522
+ "step_time": 10.934283661842347
1523
+ },
1524
+ {
1525
+ "clip_ratio/high_max": 0.0,
1526
+ "clip_ratio/high_mean": 0.0,
1527
+ "clip_ratio/low_mean": 0.0,
1528
+ "clip_ratio/low_min": 0.0,
1529
+ "clip_ratio/region_mean": 0.0,
1530
+ "completions/clipped_ratio": 0.43750002086162565,
1531
+ "completions/max_length": 256.0,
1532
+ "completions/max_terminated_length": 246.0,
1533
+ "completions/mean_length": 200.75416870117186,
1534
+ "completions/mean_terminated_length": 159.0064697265625,
1535
+ "completions/min_length": 59.6,
1536
+ "completions/min_terminated_length": 59.6,
1537
+ "entropy": 1.853849744796753,
1538
+ "epoch": 0.55,
1539
+ "frac_reward_zero_std": 0.0,
1540
+ "grad_norm": 3.3859314918518066,
1541
+ "kl": 0.2895158067345619,
1542
+ "learning_rate": 3.6300000000000004e-06,
1543
+ "loss": -0.0048,
1544
+ "num_tokens": 3292993.0,
1545
+ "reward": 1.8271270751953126,
1546
+ "reward_std": 0.42741702795028685,
1547
+ "rewards/politeness_reward_func/mean": 1.8271270513534545,
1548
+ "rewards/politeness_reward_func/std": 0.47546608448028566,
1549
+ "step": 275,
1550
+ "step_time": 10.752403935790062
1551
+ },
1552
+ {
1553
+ "clip_ratio/high_max": 0.0,
1554
+ "clip_ratio/high_mean": 0.0,
1555
+ "clip_ratio/low_mean": 0.0,
1556
+ "clip_ratio/low_min": 0.0,
1557
+ "clip_ratio/region_mean": 0.0,
1558
+ "completions/clipped_ratio": 0.35833334624767305,
1559
+ "completions/max_length": 256.0,
1560
+ "completions/max_terminated_length": 241.8,
1561
+ "completions/mean_length": 189.95000305175782,
1562
+ "completions/mean_terminated_length": 153.42260131835937,
1563
+ "completions/min_length": 57.8,
1564
+ "completions/min_terminated_length": 57.8,
1565
+ "entropy": 1.8662793278694152,
1566
+ "epoch": 0.56,
1567
+ "frac_reward_zero_std": 0.0,
1568
+ "grad_norm": 3.938138484954834,
1569
+ "kl": 0.29017033874988557,
1570
+ "learning_rate": 3.6050000000000002e-06,
1571
+ "loss": -0.0162,
1572
+ "num_tokens": 3355893.0,
1573
+ "reward": 1.7445276975631714,
1574
+ "reward_std": 0.4672766387462616,
1575
+ "rewards/politeness_reward_func/mean": 1.744527554512024,
1576
+ "rewards/politeness_reward_func/std": 0.4982309639453888,
1577
+ "step": 280,
1578
+ "step_time": 10.624229270964861
1579
+ },
1580
+ {
1581
+ "clip_ratio/high_max": 0.0,
1582
+ "clip_ratio/high_mean": 0.0,
1583
+ "clip_ratio/low_mean": 0.0,
1584
+ "clip_ratio/low_min": 0.0,
1585
+ "clip_ratio/region_mean": 0.0,
1586
+ "completions/clipped_ratio": 0.31250000596046446,
1587
+ "completions/max_length": 256.0,
1588
+ "completions/max_terminated_length": 249.4,
1589
+ "completions/mean_length": 190.03333740234376,
1590
+ "completions/mean_terminated_length": 162.05103149414063,
1591
+ "completions/min_length": 65.8,
1592
+ "completions/min_terminated_length": 65.8,
1593
+ "entropy": 1.74795663356781,
1594
+ "epoch": 0.57,
1595
+ "frac_reward_zero_std": 0.0,
1596
+ "grad_norm": 3.650832176208496,
1597
+ "kl": 0.21563028842210769,
1598
+ "learning_rate": 3.58e-06,
1599
+ "loss": 0.0183,
1600
+ "num_tokens": 3418413.0,
1601
+ "reward": 1.6855457305908204,
1602
+ "reward_std": 0.3864980161190033,
1603
+ "rewards/politeness_reward_func/mean": 1.6855456590652467,
1604
+ "rewards/politeness_reward_func/std": 0.4192918837070465,
1605
+ "step": 285,
1606
+ "step_time": 10.76889206841588
1607
+ },
1608
+ {
1609
+ "clip_ratio/high_max": 0.0,
1610
+ "clip_ratio/high_mean": 0.0,
1611
+ "clip_ratio/low_mean": 0.0,
1612
+ "clip_ratio/low_min": 0.0,
1613
+ "clip_ratio/region_mean": 0.0,
1614
+ "completions/clipped_ratio": 0.28750001192092894,
1615
+ "completions/max_length": 256.0,
1616
+ "completions/max_terminated_length": 240.4,
1617
+ "completions/mean_length": 174.78333740234376,
1618
+ "completions/mean_terminated_length": 144.00184173583983,
1619
+ "completions/min_length": 54.4,
1620
+ "completions/min_terminated_length": 54.4,
1621
+ "entropy": 1.5682233929634095,
1622
+ "epoch": 0.58,
1623
+ "frac_reward_zero_std": 0.0,
1624
+ "grad_norm": 4.823276042938232,
1625
+ "kl": 0.25128297358751295,
1626
+ "learning_rate": 3.5550000000000003e-06,
1627
+ "loss": 0.0429,
1628
+ "num_tokens": 3477785.0,
1629
+ "reward": 1.6693106412887573,
1630
+ "reward_std": 0.3742055296897888,
1631
+ "rewards/politeness_reward_func/mean": 1.6693106412887573,
1632
+ "rewards/politeness_reward_func/std": 0.4070769190788269,
1633
+ "step": 290,
1634
+ "step_time": 10.749481346458197
1635
+ },
1636
+ {
1637
+ "clip_ratio/high_max": 0.0,
1638
+ "clip_ratio/high_mean": 0.0,
1639
+ "clip_ratio/low_mean": 0.0,
1640
+ "clip_ratio/low_min": 0.0,
1641
+ "clip_ratio/region_mean": 0.0,
1642
+ "completions/clipped_ratio": 0.27083333730697634,
1643
+ "completions/max_length": 256.0,
1644
+ "completions/max_terminated_length": 240.8,
1645
+ "completions/mean_length": 174.8166717529297,
1646
+ "completions/mean_terminated_length": 145.73609619140626,
1647
+ "completions/min_length": 56.6,
1648
+ "completions/min_terminated_length": 56.6,
1649
+ "entropy": 1.7244809031486512,
1650
+ "epoch": 0.59,
1651
+ "frac_reward_zero_std": 0.0,
1652
+ "grad_norm": 5.0669989585876465,
1653
+ "kl": 0.28962416797876356,
1654
+ "learning_rate": 3.53e-06,
1655
+ "loss": 0.0175,
1656
+ "num_tokens": 3537101.0,
1657
+ "reward": 1.6176325321197509,
1658
+ "reward_std": 0.4998854100704193,
1659
+ "rewards/politeness_reward_func/mean": 1.6176324844360352,
1660
+ "rewards/politeness_reward_func/std": 0.5255676746368408,
1661
+ "step": 295,
1662
+ "step_time": 10.83822975307703
1663
+ },
1664
+ {
1665
+ "clip_ratio/high_max": 0.0,
1666
+ "clip_ratio/high_mean": 0.0,
1667
+ "clip_ratio/low_mean": 0.0,
1668
+ "clip_ratio/low_min": 0.0,
1669
+ "clip_ratio/region_mean": 0.0,
1670
+ "completions/clipped_ratio": 0.29166668355464936,
1671
+ "completions/max_length": 256.0,
1672
+ "completions/max_terminated_length": 249.2,
1673
+ "completions/mean_length": 176.39166870117188,
1674
+ "completions/mean_terminated_length": 145.56358642578124,
1675
+ "completions/min_length": 58.0,
1676
+ "completions/min_terminated_length": 58.0,
1677
+ "entropy": 1.7615143656730652,
1678
+ "epoch": 0.6,
1679
+ "frac_reward_zero_std": 0.0,
1680
+ "grad_norm": 9.959527015686035,
1681
+ "kl": 0.30573274195194244,
1682
+ "learning_rate": 3.505e-06,
1683
+ "loss": 0.0663,
1684
+ "num_tokens": 3597051.0,
1685
+ "reward": 1.7090127229690553,
1686
+ "reward_std": 0.4356145679950714,
1687
+ "rewards/politeness_reward_func/mean": 1.7090126276016235,
1688
+ "rewards/politeness_reward_func/std": 0.458771163225174,
1689
+ "step": 300,
1690
+ "step_time": 10.85788278505206
1691
+ },
1692
+ {
1693
+ "clip_ratio/high_max": 0.0,
1694
+ "clip_ratio/high_mean": 0.0,
1695
+ "clip_ratio/low_mean": 0.0,
1696
+ "clip_ratio/low_min": 0.0,
1697
+ "clip_ratio/region_mean": 0.0,
1698
+ "completions/clipped_ratio": 0.325000011920929,
1699
+ "completions/max_length": 256.0,
1700
+ "completions/max_terminated_length": 250.2,
1701
+ "completions/mean_length": 181.22083740234376,
1702
+ "completions/mean_terminated_length": 146.1617431640625,
1703
+ "completions/min_length": 55.0,
1704
+ "completions/min_terminated_length": 55.0,
1705
+ "entropy": 1.9170334219932557,
1706
+ "epoch": 0.61,
1707
+ "frac_reward_zero_std": 0.0,
1708
+ "grad_norm": 4.597347259521484,
1709
+ "kl": 0.2534967973828316,
1710
+ "learning_rate": 3.48e-06,
1711
+ "loss": -0.019,
1712
+ "num_tokens": 3657680.0,
1713
+ "reward": 1.8059730291366578,
1714
+ "reward_std": 0.4023502767086029,
1715
+ "rewards/politeness_reward_func/mean": 1.8059730291366578,
1716
+ "rewards/politeness_reward_func/std": 0.42335187196731566,
1717
+ "step": 305,
1718
+ "step_time": 10.813059192150831
1719
+ },
1720
+ {
1721
+ "clip_ratio/high_max": 0.0,
1722
+ "clip_ratio/high_mean": 0.0,
1723
+ "clip_ratio/low_mean": 0.0,
1724
+ "clip_ratio/low_min": 0.0,
1725
+ "clip_ratio/region_mean": 0.0,
1726
+ "completions/clipped_ratio": 0.3458333432674408,
1727
+ "completions/max_length": 256.0,
1728
+ "completions/max_terminated_length": 247.6,
1729
+ "completions/mean_length": 192.33333435058594,
1730
+ "completions/mean_terminated_length": 158.2584686279297,
1731
+ "completions/min_length": 61.0,
1732
+ "completions/min_terminated_length": 61.0,
1733
+ "entropy": 1.7168800473213195,
1734
+ "epoch": 0.62,
1735
+ "frac_reward_zero_std": 0.0,
1736
+ "grad_norm": 3.3291268348693848,
1737
+ "kl": 0.30181923806667327,
1738
+ "learning_rate": 3.455e-06,
1739
+ "loss": 0.0438,
1740
+ "num_tokens": 3721168.0,
1741
+ "reward": 1.6650686740875245,
1742
+ "reward_std": 0.4736558377742767,
1743
+ "rewards/politeness_reward_func/mean": 1.6650686979293823,
1744
+ "rewards/politeness_reward_func/std": 0.5267638087272644,
1745
+ "step": 310,
1746
+ "step_time": 10.904363192617893
1747
+ },
1748
+ {
1749
+ "clip_ratio/high_max": 0.0,
1750
+ "clip_ratio/high_mean": 0.0,
1751
+ "clip_ratio/low_mean": 0.0,
1752
+ "clip_ratio/low_min": 0.0,
1753
+ "clip_ratio/region_mean": 0.0,
1754
+ "completions/clipped_ratio": 0.2708333432674408,
1755
+ "completions/max_length": 256.0,
1756
+ "completions/max_terminated_length": 248.4,
1757
+ "completions/mean_length": 178.20833740234374,
1758
+ "completions/mean_terminated_length": 150.5656768798828,
1759
+ "completions/min_length": 52.8,
1760
+ "completions/min_terminated_length": 52.8,
1761
+ "entropy": 1.6762184143066405,
1762
+ "epoch": 0.63,
1763
+ "frac_reward_zero_std": 0.0,
1764
+ "grad_norm": 5.4125494956970215,
1765
+ "kl": 0.3411761596798897,
1766
+ "learning_rate": 3.4300000000000006e-06,
1767
+ "loss": 0.0739,
1768
+ "num_tokens": 3781170.0,
1769
+ "reward": 1.698883295059204,
1770
+ "reward_std": 0.5051921904087067,
1771
+ "rewards/politeness_reward_func/mean": 1.6988832235336304,
1772
+ "rewards/politeness_reward_func/std": 0.5214821577072144,
1773
+ "step": 315,
1774
+ "step_time": 10.870812387019395
1775
+ },
1776
+ {
1777
+ "clip_ratio/high_max": 0.0,
1778
+ "clip_ratio/high_mean": 0.0,
1779
+ "clip_ratio/low_mean": 0.0,
1780
+ "clip_ratio/low_min": 0.0,
1781
+ "clip_ratio/region_mean": 0.0,
1782
+ "completions/clipped_ratio": 0.3333333432674408,
1783
+ "completions/max_length": 256.0,
1784
+ "completions/max_terminated_length": 243.0,
1785
+ "completions/mean_length": 174.2375061035156,
1786
+ "completions/mean_terminated_length": 133.60752563476564,
1787
+ "completions/min_length": 38.8,
1788
+ "completions/min_terminated_length": 38.8,
1789
+ "entropy": 1.608705174922943,
1790
+ "epoch": 0.64,
1791
+ "frac_reward_zero_std": 0.0,
1792
+ "grad_norm": 4.398674011230469,
1793
+ "kl": 0.3067095875740051,
1794
+ "learning_rate": 3.4050000000000004e-06,
1795
+ "loss": 0.0628,
1796
+ "num_tokens": 3840075.0,
1797
+ "reward": 1.6573886156082154,
1798
+ "reward_std": 0.4343807339668274,
1799
+ "rewards/politeness_reward_func/mean": 1.6573885917663573,
1800
+ "rewards/politeness_reward_func/std": 0.4526840627193451,
1801
+ "step": 320,
1802
+ "step_time": 10.640466004610062
1803
+ },
1804
+ {
1805
+ "clip_ratio/high_max": 0.0,
1806
+ "clip_ratio/high_mean": 0.0,
1807
+ "clip_ratio/low_mean": 0.0,
1808
+ "clip_ratio/low_min": 0.0,
1809
+ "clip_ratio/region_mean": 0.0,
1810
+ "completions/clipped_ratio": 0.2666666716337204,
1811
+ "completions/max_length": 256.0,
1812
+ "completions/max_terminated_length": 248.8,
1813
+ "completions/mean_length": 177.06666870117186,
1814
+ "completions/mean_terminated_length": 148.47830200195312,
1815
+ "completions/min_length": 59.6,
1816
+ "completions/min_terminated_length": 59.6,
1817
+ "entropy": 1.832581627368927,
1818
+ "epoch": 0.65,
1819
+ "frac_reward_zero_std": 0.0,
1820
+ "grad_norm": 6.723278522491455,
1821
+ "kl": 0.2649755135178566,
1822
+ "learning_rate": 3.3800000000000007e-06,
1823
+ "loss": -0.026,
1824
+ "num_tokens": 3899579.0,
1825
+ "reward": 1.81795072555542,
1826
+ "reward_std": 0.39153199791908266,
1827
+ "rewards/politeness_reward_func/mean": 1.8179506063461304,
1828
+ "rewards/politeness_reward_func/std": 0.41539669036865234,
1829
+ "step": 325,
1830
+ "step_time": 10.751856955885888
1831
+ },
1832
+ {
1833
+ "clip_ratio/high_max": 0.0,
1834
+ "clip_ratio/high_mean": 0.0,
1835
+ "clip_ratio/low_mean": 0.0,
1836
+ "clip_ratio/low_min": 0.0,
1837
+ "clip_ratio/region_mean": 0.0,
1838
+ "completions/clipped_ratio": 0.24583333730697632,
1839
+ "completions/max_length": 256.0,
1840
+ "completions/max_terminated_length": 238.2,
1841
+ "completions/mean_length": 164.57083740234376,
1842
+ "completions/mean_terminated_length": 135.28192138671875,
1843
+ "completions/min_length": 47.6,
1844
+ "completions/min_terminated_length": 47.6,
1845
+ "entropy": 1.6712656617164612,
1846
+ "epoch": 0.66,
1847
+ "frac_reward_zero_std": 0.0,
1848
+ "grad_norm": 4.251152992248535,
1849
+ "kl": 0.2780707836151123,
1850
+ "learning_rate": 3.3550000000000005e-06,
1851
+ "loss": 0.0515,
1852
+ "num_tokens": 3956132.0,
1853
+ "reward": 1.6795947074890136,
1854
+ "reward_std": 0.4304037630558014,
1855
+ "rewards/politeness_reward_func/mean": 1.6795947313308717,
1856
+ "rewards/politeness_reward_func/std": 0.456265515089035,
1857
+ "step": 330,
1858
+ "step_time": 10.65747187435627
1859
+ },
1860
+ {
1861
+ "clip_ratio/high_max": 0.0,
1862
+ "clip_ratio/high_mean": 0.0,
1863
+ "clip_ratio/low_mean": 0.0,
1864
+ "clip_ratio/low_min": 0.0,
1865
+ "clip_ratio/region_mean": 0.0,
1866
+ "completions/clipped_ratio": 0.27083334028720857,
1867
+ "completions/max_length": 256.0,
1868
+ "completions/max_terminated_length": 250.8,
1869
+ "completions/mean_length": 170.53750610351562,
1870
+ "completions/mean_terminated_length": 139.22777709960937,
1871
+ "completions/min_length": 48.4,
1872
+ "completions/min_terminated_length": 48.4,
1873
+ "entropy": 1.6737295269966126,
1874
+ "epoch": 0.67,
1875
+ "frac_reward_zero_std": 0.0,
1876
+ "grad_norm": 3.981579065322876,
1877
+ "kl": 0.27397735267877577,
1878
+ "learning_rate": 3.3300000000000003e-06,
1879
+ "loss": 0.0586,
1880
+ "num_tokens": 4014149.0,
1881
+ "reward": 1.639498805999756,
1882
+ "reward_std": 0.3987973630428314,
1883
+ "rewards/politeness_reward_func/mean": 1.639498782157898,
1884
+ "rewards/politeness_reward_func/std": 0.4488051861524582,
1885
+ "step": 335,
1886
+ "step_time": 10.64305683746934
1887
+ },
1888
+ {
1889
+ "clip_ratio/high_max": 0.0,
1890
+ "clip_ratio/high_mean": 0.0,
1891
+ "clip_ratio/low_mean": 0.0,
1892
+ "clip_ratio/low_min": 0.0,
1893
+ "clip_ratio/region_mean": 0.0,
1894
+ "completions/clipped_ratio": 0.24583334028720855,
1895
+ "completions/max_length": 256.0,
1896
+ "completions/max_terminated_length": 247.6,
1897
+ "completions/mean_length": 170.57083435058593,
1898
+ "completions/mean_terminated_length": 143.0279296875,
1899
+ "completions/min_length": 53.6,
1900
+ "completions/min_terminated_length": 53.6,
1901
+ "entropy": 1.8374268651008605,
1902
+ "epoch": 0.68,
1903
+ "frac_reward_zero_std": 0.0,
1904
+ "grad_norm": 5.304769515991211,
1905
+ "kl": 0.2890111759305,
1906
+ "learning_rate": 3.3050000000000005e-06,
1907
+ "loss": 0.0539,
1908
+ "num_tokens": 4072318.0,
1909
+ "reward": 1.827410674095154,
1910
+ "reward_std": 0.34623334407806394,
1911
+ "rewards/politeness_reward_func/mean": 1.8274105072021485,
1912
+ "rewards/politeness_reward_func/std": 0.3635041773319244,
1913
+ "step": 340,
1914
+ "step_time": 10.67860155031085
1915
+ },
1916
+ {
1917
+ "clip_ratio/high_max": 0.0,
1918
+ "clip_ratio/high_mean": 0.0,
1919
+ "clip_ratio/low_mean": 0.0,
1920
+ "clip_ratio/low_min": 0.0,
1921
+ "clip_ratio/region_mean": 0.0,
1922
+ "completions/clipped_ratio": 0.3791666865348816,
1923
+ "completions/max_length": 256.0,
1924
+ "completions/max_terminated_length": 246.8,
1925
+ "completions/mean_length": 182.3541717529297,
1926
+ "completions/mean_terminated_length": 138.33062896728515,
1927
+ "completions/min_length": 54.4,
1928
+ "completions/min_terminated_length": 54.4,
1929
+ "entropy": 1.8917979717254638,
1930
+ "epoch": 0.69,
1931
+ "frac_reward_zero_std": 0.0,
1932
+ "grad_norm": 3.783524751663208,
1933
+ "kl": 0.3112198129296303,
1934
+ "learning_rate": 3.2800000000000004e-06,
1935
+ "loss": -0.0079,
1936
+ "num_tokens": 4133395.0,
1937
+ "reward": 1.8497365951538085,
1938
+ "reward_std": 0.41627883315086367,
1939
+ "rewards/politeness_reward_func/mean": 1.8497365236282348,
1940
+ "rewards/politeness_reward_func/std": 0.4410283327102661,
1941
+ "step": 345,
1942
+ "step_time": 10.664074825495481
1943
+ },
1944
+ {
1945
+ "clip_ratio/high_max": 0.0,
1946
+ "clip_ratio/high_mean": 0.0,
1947
+ "clip_ratio/low_mean": 0.0,
1948
+ "clip_ratio/low_min": 0.0,
1949
+ "clip_ratio/region_mean": 0.0,
1950
+ "completions/clipped_ratio": 0.43750001192092897,
1951
+ "completions/max_length": 256.0,
1952
+ "completions/max_terminated_length": 250.8,
1953
+ "completions/mean_length": 192.87083740234374,
1954
+ "completions/mean_terminated_length": 144.26836700439452,
1955
+ "completions/min_length": 62.4,
1956
+ "completions/min_terminated_length": 62.4,
1957
+ "entropy": 1.9642783045768737,
1958
+ "epoch": 0.7,
1959
+ "frac_reward_zero_std": 0.0,
1960
+ "grad_norm": 3.8300986289978027,
1961
+ "kl": 0.3175328865647316,
1962
+ "learning_rate": 3.255e-06,
1963
+ "loss": 0.033,
1964
+ "num_tokens": 4197028.0,
1965
+ "reward": 1.8747668027877809,
1966
+ "reward_std": 0.485896635055542,
1967
+ "rewards/politeness_reward_func/mean": 1.8747668027877809,
1968
+ "rewards/politeness_reward_func/std": 0.5172119975090027,
1969
+ "step": 350,
1970
+ "step_time": 10.599913079291582
1971
+ },
1972
+ {
1973
+ "clip_ratio/high_max": 0.0,
1974
+ "clip_ratio/high_mean": 0.0,
1975
+ "clip_ratio/low_mean": 0.0,
1976
+ "clip_ratio/low_min": 0.0,
1977
+ "clip_ratio/region_mean": 0.0,
1978
+ "completions/clipped_ratio": 0.30416667759418486,
1979
+ "completions/max_length": 256.0,
1980
+ "completions/max_terminated_length": 245.4,
1981
+ "completions/mean_length": 173.94584045410156,
1982
+ "completions/mean_terminated_length": 138.12840881347657,
1983
+ "completions/min_length": 50.4,
1984
+ "completions/min_terminated_length": 50.4,
1985
+ "entropy": 1.6719454765319823,
1986
+ "epoch": 0.71,
1987
+ "frac_reward_zero_std": 0.0,
1988
+ "grad_norm": 3.62943172454834,
1989
+ "kl": 0.354923115670681,
1990
+ "learning_rate": 3.2300000000000004e-06,
1991
+ "loss": 0.04,
1992
+ "num_tokens": 4256759.0,
1993
+ "reward": 1.746033239364624,
1994
+ "reward_std": 0.46536014080047605,
1995
+ "rewards/politeness_reward_func/mean": 1.7460331916809082,
1996
+ "rewards/politeness_reward_func/std": 0.513632845878601,
1997
+ "step": 355,
1998
+ "step_time": 10.574684323370457
1999
+ },
2000
+ {
2001
+ "clip_ratio/high_max": 0.0,
2002
+ "clip_ratio/high_mean": 0.0,
2003
+ "clip_ratio/low_mean": 0.0,
2004
+ "clip_ratio/low_min": 0.0,
2005
+ "clip_ratio/region_mean": 0.0,
2006
+ "completions/clipped_ratio": 0.2166666716337204,
2007
+ "completions/max_length": 256.0,
2008
+ "completions/max_terminated_length": 246.4,
2009
+ "completions/mean_length": 164.38750610351562,
2010
+ "completions/mean_terminated_length": 139.19703826904296,
2011
+ "completions/min_length": 58.2,
2012
+ "completions/min_terminated_length": 58.2,
2013
+ "entropy": 1.7568098187446595,
2014
+ "epoch": 0.72,
2015
+ "frac_reward_zero_std": 0.0,
2016
+ "grad_norm": 6.490025997161865,
2017
+ "kl": 0.2866540029644966,
2018
+ "learning_rate": 3.2050000000000002e-06,
2019
+ "loss": -0.0106,
2020
+ "num_tokens": 4313620.0,
2021
+ "reward": 1.7767923593521118,
2022
+ "reward_std": 0.4481996238231659,
2023
+ "rewards/politeness_reward_func/mean": 1.776792311668396,
2024
+ "rewards/politeness_reward_func/std": 0.4569097697734833,
2025
+ "step": 360,
2026
+ "step_time": 10.563734823465348
2027
+ },
2028
+ {
2029
+ "clip_ratio/high_max": 0.0,
2030
+ "clip_ratio/high_mean": 0.0,
2031
+ "clip_ratio/low_mean": 0.0,
2032
+ "clip_ratio/low_min": 0.0,
2033
+ "clip_ratio/region_mean": 0.0,
2034
+ "completions/clipped_ratio": 0.3833333432674408,
2035
+ "completions/max_length": 256.0,
2036
+ "completions/max_terminated_length": 249.8,
2037
+ "completions/mean_length": 192.51250305175782,
2038
+ "completions/mean_terminated_length": 152.48777465820314,
2039
+ "completions/min_length": 54.8,
2040
+ "completions/min_terminated_length": 54.8,
2041
+ "entropy": 1.7603564739227295,
2042
+ "epoch": 0.73,
2043
+ "frac_reward_zero_std": 0.0,
2044
+ "grad_norm": 3.731227159500122,
2045
+ "kl": 0.2559724062681198,
2046
+ "learning_rate": 3.1800000000000005e-06,
2047
+ "loss": 0.0058,
2048
+ "num_tokens": 4377423.0,
2049
+ "reward": 1.7886356353759765,
2050
+ "reward_std": 0.43659440279006956,
2051
+ "rewards/politeness_reward_func/mean": 1.7886356592178345,
2052
+ "rewards/politeness_reward_func/std": 0.4865987479686737,
2053
+ "step": 365,
2054
+ "step_time": 10.596465566009282
2055
+ },
2056
+ {
2057
+ "clip_ratio/high_max": 0.0,
2058
+ "clip_ratio/high_mean": 0.0,
2059
+ "clip_ratio/low_mean": 0.0,
2060
+ "clip_ratio/low_min": 0.0,
2061
+ "clip_ratio/region_mean": 0.0,
2062
+ "completions/clipped_ratio": 0.3208333432674408,
2063
+ "completions/max_length": 256.0,
2064
+ "completions/max_terminated_length": 253.0,
2065
+ "completions/mean_length": 183.37500610351563,
2066
+ "completions/mean_terminated_length": 150.60201873779297,
2067
+ "completions/min_length": 56.2,
2068
+ "completions/min_terminated_length": 56.2,
2069
+ "entropy": 1.599543821811676,
2070
+ "epoch": 0.74,
2071
+ "frac_reward_zero_std": 0.0,
2072
+ "grad_norm": 4.495799541473389,
2073
+ "kl": 0.26432290226221083,
2074
+ "learning_rate": 3.1550000000000003e-06,
2075
+ "loss": 0.0365,
2076
+ "num_tokens": 4438841.0,
2077
+ "reward": 1.663888192176819,
2078
+ "reward_std": 0.4724281966686249,
2079
+ "rewards/politeness_reward_func/mean": 1.663888168334961,
2080
+ "rewards/politeness_reward_func/std": 0.49500845074653627,
2081
+ "step": 370,
2082
+ "step_time": 10.550004740059375
2083
+ },
2084
+ {
2085
+ "clip_ratio/high_max": 0.0,
2086
+ "clip_ratio/high_mean": 0.0,
2087
+ "clip_ratio/low_mean": 0.0,
2088
+ "clip_ratio/low_min": 0.0,
2089
+ "clip_ratio/region_mean": 0.0,
2090
+ "completions/clipped_ratio": 0.2833333402872086,
2091
+ "completions/max_length": 256.0,
2092
+ "completions/max_terminated_length": 245.8,
2093
+ "completions/mean_length": 176.43333740234374,
2094
+ "completions/mean_terminated_length": 145.9316650390625,
2095
+ "completions/min_length": 46.8,
2096
+ "completions/min_terminated_length": 46.8,
2097
+ "entropy": 1.649828588962555,
2098
+ "epoch": 0.75,
2099
+ "frac_reward_zero_std": 0.0,
2100
+ "grad_norm": 4.405642509460449,
2101
+ "kl": 0.3023358851671219,
2102
+ "learning_rate": 3.13e-06,
2103
+ "loss": 0.036,
2104
+ "num_tokens": 4498209.0,
2105
+ "reward": 1.7260100841522217,
2106
+ "reward_std": 0.4888251006603241,
2107
+ "rewards/politeness_reward_func/mean": 1.726010012626648,
2108
+ "rewards/politeness_reward_func/std": 0.5257126033306122,
2109
+ "step": 375,
2110
+ "step_time": 10.784504148364068
2111
+ },
2112
+ {
2113
+ "clip_ratio/high_max": 0.0,
2114
+ "clip_ratio/high_mean": 0.0,
2115
+ "clip_ratio/low_mean": 0.0,
2116
+ "clip_ratio/low_min": 0.0,
2117
+ "clip_ratio/region_mean": 0.0,
2118
+ "completions/clipped_ratio": 0.2750000059604645,
2119
+ "completions/max_length": 256.0,
2120
+ "completions/max_terminated_length": 251.0,
2121
+ "completions/mean_length": 177.4791717529297,
2122
+ "completions/mean_terminated_length": 149.0453887939453,
2123
+ "completions/min_length": 52.6,
2124
+ "completions/min_terminated_length": 52.6,
2125
+ "entropy": 1.669038712978363,
2126
+ "epoch": 0.76,
2127
+ "frac_reward_zero_std": 0.0,
2128
+ "grad_norm": 4.453659534454346,
2129
+ "kl": 0.26074831187725067,
2130
+ "learning_rate": 3.1050000000000003e-06,
2131
+ "loss": 0.0133,
2132
+ "num_tokens": 4558116.0,
2133
+ "reward": 1.7338483333587646,
2134
+ "reward_std": 0.40128968358039857,
2135
+ "rewards/politeness_reward_func/mean": 1.7338482856750488,
2136
+ "rewards/politeness_reward_func/std": 0.4201090157032013,
2137
+ "step": 380,
2138
+ "step_time": 10.85576168820262
2139
+ },
2140
+ {
2141
+ "clip_ratio/high_max": 0.0,
2142
+ "clip_ratio/high_mean": 0.0,
2143
+ "clip_ratio/low_mean": 0.0,
2144
+ "clip_ratio/low_min": 0.0,
2145
+ "clip_ratio/region_mean": 0.0,
2146
+ "completions/clipped_ratio": 0.3166666775941849,
2147
+ "completions/max_length": 256.0,
2148
+ "completions/max_terminated_length": 248.4,
2149
+ "completions/mean_length": 175.9791687011719,
2150
+ "completions/mean_terminated_length": 141.09287872314454,
2151
+ "completions/min_length": 60.8,
2152
+ "completions/min_terminated_length": 60.8,
2153
+ "entropy": 1.70971120595932,
2154
+ "epoch": 0.77,
2155
+ "frac_reward_zero_std": 0.0,
2156
+ "grad_norm": 4.21511173248291,
2157
+ "kl": 0.33568228632211683,
2158
+ "learning_rate": 3.08e-06,
2159
+ "loss": 0.0539,
2160
+ "num_tokens": 4617679.0,
2161
+ "reward": 1.8021876573562623,
2162
+ "reward_std": 0.39429293274879457,
2163
+ "rewards/politeness_reward_func/mean": 1.8021875858306884,
2164
+ "rewards/politeness_reward_func/std": 0.4176251709461212,
2165
+ "step": 385,
2166
+ "step_time": 10.889974600821734
2167
+ },
2168
+ {
2169
+ "clip_ratio/high_max": 0.0,
2170
+ "clip_ratio/high_mean": 0.0,
2171
+ "clip_ratio/low_mean": 0.0,
2172
+ "clip_ratio/low_min": 0.0,
2173
+ "clip_ratio/region_mean": 0.0,
2174
+ "completions/clipped_ratio": 0.24583333879709243,
2175
+ "completions/max_length": 256.0,
2176
+ "completions/max_terminated_length": 250.4,
2177
+ "completions/mean_length": 162.72084045410156,
2178
+ "completions/mean_terminated_length": 133.57429962158204,
2179
+ "completions/min_length": 56.8,
2180
+ "completions/min_terminated_length": 56.8,
2181
+ "entropy": 1.5501957058906555,
2182
+ "epoch": 0.78,
2183
+ "frac_reward_zero_std": 0.0,
2184
+ "grad_norm": 5.835730075836182,
2185
+ "kl": 0.30908732712268827,
2186
+ "learning_rate": 3.0550000000000004e-06,
2187
+ "loss": 0.0817,
2188
+ "num_tokens": 4674108.0,
2189
+ "reward": 1.7301536321640014,
2190
+ "reward_std": 0.44032427072525027,
2191
+ "rewards/politeness_reward_func/mean": 1.7301535844802856,
2192
+ "rewards/politeness_reward_func/std": 0.4708732008934021,
2193
+ "step": 390,
2194
+ "step_time": 10.865944185107946
2195
+ },
2196
+ {
2197
+ "clip_ratio/high_max": 0.0,
2198
+ "clip_ratio/high_mean": 0.0,
2199
+ "clip_ratio/low_mean": 0.0,
2200
+ "clip_ratio/low_min": 0.0,
2201
+ "clip_ratio/region_mean": 0.0,
2202
+ "completions/clipped_ratio": 0.1791666716337204,
2203
+ "completions/max_length": 256.0,
2204
+ "completions/max_terminated_length": 247.8,
2205
+ "completions/mean_length": 157.25000915527343,
2206
+ "completions/mean_terminated_length": 135.5115005493164,
2207
+ "completions/min_length": 51.2,
2208
+ "completions/min_terminated_length": 51.2,
2209
+ "entropy": 1.6409943580627442,
2210
+ "epoch": 0.79,
2211
+ "frac_reward_zero_std": 0.0,
2212
+ "grad_norm": 6.950301647186279,
2213
+ "kl": 0.36195332258939744,
2214
+ "learning_rate": 3.0300000000000002e-06,
2215
+ "loss": 0.0805,
2216
+ "num_tokens": 4729176.0,
2217
+ "reward": 1.737137007713318,
2218
+ "reward_std": 0.4313428819179535,
2219
+ "rewards/politeness_reward_func/mean": 1.737137007713318,
2220
+ "rewards/politeness_reward_func/std": 0.4497229218482971,
2221
+ "step": 395,
2222
+ "step_time": 10.928832749277353
2223
+ },
2224
+ {
2225
+ "clip_ratio/high_max": 0.0,
2226
+ "clip_ratio/high_mean": 0.0,
2227
+ "clip_ratio/low_mean": 0.0,
2228
+ "clip_ratio/low_min": 0.0,
2229
+ "clip_ratio/region_mean": 0.0,
2230
+ "completions/clipped_ratio": 0.3291666775941849,
2231
+ "completions/max_length": 256.0,
2232
+ "completions/max_terminated_length": 250.4,
2233
+ "completions/mean_length": 180.15833740234376,
2234
+ "completions/mean_terminated_length": 143.84142608642577,
2235
+ "completions/min_length": 58.0,
2236
+ "completions/min_terminated_length": 58.0,
2237
+ "entropy": 1.7865488886833192,
2238
+ "epoch": 0.8,
2239
+ "frac_reward_zero_std": 0.0,
2240
+ "grad_norm": 3.5700058937072754,
2241
+ "kl": 0.2882836848497391,
2242
+ "learning_rate": 3.005e-06,
2243
+ "loss": 0.0432,
2244
+ "num_tokens": 4790094.0,
2245
+ "reward": 1.7929600477218628,
2246
+ "reward_std": 0.4272151470184326,
2247
+ "rewards/politeness_reward_func/mean": 1.792960000038147,
2248
+ "rewards/politeness_reward_func/std": 0.4529218733310699,
2249
+ "step": 400,
2250
+ "step_time": 10.825692503154277
2251
+ }
2252
+ ],
2253
+ "logging_steps": 5,
2254
+ "max_steps": 1000,
2255
+ "num_input_tokens_seen": 4790094,
2256
+ "num_train_epochs": 2,
2257
+ "save_steps": 100,
2258
+ "stateful_callbacks": {
2259
+ "TrainerControl": {
2260
+ "args": {
2261
+ "should_epoch_stop": false,
2262
+ "should_evaluate": false,
2263
+ "should_log": false,
2264
+ "should_save": true,
2265
+ "should_training_stop": false
2266
+ },
2267
+ "attributes": {}
2268
+ }
2269
+ },
2270
+ "total_flos": 0.0,
2271
+ "train_batch_size": 4,
2272
+ "trial_name": null,
2273
+ "trial_params": null
2274
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82b31eee2e9e5185f99ca7159dbcba50f81778df0098ecfea28a3e0cfd83b36e
3
+ size 7569
vocab.json ADDED
The diff for this file is too large to render. See raw diff