loocorez commited on
Commit
f403259
·
verified ·
1 Parent(s): f215909

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 896,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 4864,
12
+ "max_position_embeddings": 32768,
13
+ "max_window_layers": 21,
14
+ "model_type": "qwen2",
15
+ "num_attention_heads": 14,
16
+ "num_hidden_layers": 24,
17
+ "num_key_value_heads": 2,
18
+ "rms_norm_eps": 1e-06,
19
+ "rope_scaling": null,
20
+ "rope_theta": 1000000.0,
21
+ "sliding_window": 32768,
22
+ "tie_word_embeddings": true,
23
+ "torch_dtype": "bfloat16",
24
+ "transformers_version": "4.52.4",
25
+ "use_cache": false,
26
+ "use_sliding_window": false,
27
+ "vocab_size": 151936
28
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
+ "eos_token_id": 151645,
5
+ "transformers_version": "4.52.4",
6
+ "use_cache": false
7
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4e384480884b766f3a89ba0eec8a718a17b84c2a7c634c9c6494cafee877d0f
3
+ size 988097824
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
trainer_state.json ADDED
@@ -0,0 +1,2024 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.009184423218221896,
6
+ "eval_steps": 10,
7
+ "global_step": 100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 0.0,
19
+ "completions/max_length": 1911.0,
20
+ "completions/max_terminated_length": 1911.0,
21
+ "completions/mean_length": 737.8,
22
+ "completions/mean_terminated_length": 737.8,
23
+ "completions/min_length": 9.0,
24
+ "completions/min_terminated_length": 9.0,
25
+ "epoch": 9.184423218221895e-05,
26
+ "grad_norm": 2.7607495142433,
27
+ "kl": 0.00098419189453125,
28
+ "learning_rate": 0.0,
29
+ "loss": -0.6015,
30
+ "num_tokens": 43442.0,
31
+ "reward": 0.15676024556159973,
32
+ "reward_std": 0.0992189422249794,
33
+ "rewards/format_reward_func": 0.6000000238418579,
34
+ "rewards/lcs_reward_func": 0.03676025941967964,
35
+ "step": 1
36
+ },
37
+ {
38
+ "clip_ratio/high_max": 0.0,
39
+ "clip_ratio/high_mean": 0.0,
40
+ "clip_ratio/low_mean": 0.0,
41
+ "clip_ratio/low_min": 0.0,
42
+ "clip_ratio/region_mean": 0.0,
43
+ "epoch": 0.0001836884643644379,
44
+ "grad_norm": 2.7607084486431908,
45
+ "kl": 0.00098419189453125,
46
+ "learning_rate": 1e-07,
47
+ "loss": -0.6015,
48
+ "step": 2
49
+ },
50
+ {
51
+ "clip_ratio/high_max": 0.0,
52
+ "clip_ratio/high_mean": 0.0,
53
+ "clip_ratio/low_mean": 0.0,
54
+ "clip_ratio/low_min": 0.0,
55
+ "clip_ratio/region_mean": 0.0,
56
+ "completions/clipped_ratio": 0.0,
57
+ "completions/max_length": 1826.0,
58
+ "completions/max_terminated_length": 1826.0,
59
+ "completions/mean_length": 739.525,
60
+ "completions/mean_terminated_length": 739.525,
61
+ "completions/min_length": 9.0,
62
+ "completions/min_terminated_length": 9.0,
63
+ "epoch": 0.0002755326965466569,
64
+ "grad_norm": 2.5034917199753006,
65
+ "kl": 0.0010004043579101562,
66
+ "learning_rate": 2e-07,
67
+ "loss": -0.543,
68
+ "num_tokens": 85793.0,
69
+ "reward": 0.15654990077018738,
70
+ "reward_std": 0.09047843515872955,
71
+ "rewards/format_reward_func": 0.6500000357627869,
72
+ "rewards/lcs_reward_func": 0.02654988504946232,
73
+ "step": 3
74
+ },
75
+ {
76
+ "clip_ratio/high_max": 0.0,
77
+ "clip_ratio/high_mean": 0.0,
78
+ "clip_ratio/low_mean": 0.0,
79
+ "clip_ratio/low_min": 0.0,
80
+ "clip_ratio/region_mean": 0.0,
81
+ "epoch": 0.0003673769287288758,
82
+ "grad_norm": 2.5049494762583047,
83
+ "kl": 0.001041412353515625,
84
+ "learning_rate": 3e-07,
85
+ "loss": -0.543,
86
+ "step": 4
87
+ },
88
+ {
89
+ "clip_ratio/high_max": 0.0,
90
+ "clip_ratio/high_mean": 0.0,
91
+ "clip_ratio/low_mean": 0.0,
92
+ "clip_ratio/low_min": 0.0,
93
+ "clip_ratio/region_mean": 0.0,
94
+ "completions/clipped_ratio": 0.0,
95
+ "completions/max_length": 1984.0,
96
+ "completions/max_terminated_length": 1984.0,
97
+ "completions/mean_length": 722.925,
98
+ "completions/mean_terminated_length": 722.925,
99
+ "completions/min_length": 9.0,
100
+ "completions/min_terminated_length": 9.0,
101
+ "epoch": 0.0004592211609110948,
102
+ "grad_norm": 3.0465073005843717,
103
+ "kl": 0.0007085800170898438,
104
+ "learning_rate": 4e-07,
105
+ "loss": -0.6571,
106
+ "num_tokens": 127850.0,
107
+ "reward": 0.15640318393707275,
108
+ "reward_std": 0.08817142993211746,
109
+ "rewards/format_reward_func": 0.6700000166893005,
110
+ "rewards/lcs_reward_func": 0.022403182461857796,
111
+ "step": 5
112
+ },
113
+ {
114
+ "clip_ratio/high_max": 0.0,
115
+ "clip_ratio/high_mean": 0.0,
116
+ "clip_ratio/low_mean": 0.0,
117
+ "clip_ratio/low_min": 0.0,
118
+ "clip_ratio/region_mean": 0.0,
119
+ "epoch": 0.0005510653930933138,
120
+ "grad_norm": 3.0448516969487267,
121
+ "kl": 0.000705718994140625,
122
+ "learning_rate": 5e-07,
123
+ "loss": -0.6571,
124
+ "step": 6
125
+ },
126
+ {
127
+ "clip_ratio/high_max": 0.0,
128
+ "clip_ratio/high_mean": 0.0,
129
+ "clip_ratio/low_mean": 0.0,
130
+ "clip_ratio/low_min": 0.0,
131
+ "clip_ratio/region_mean": 0.0,
132
+ "completions/clipped_ratio": 0.0,
133
+ "completions/max_length": 2029.0,
134
+ "completions/max_terminated_length": 2029.0,
135
+ "completions/mean_length": 894.325,
136
+ "completions/mean_terminated_length": 894.325,
137
+ "completions/min_length": 9.0,
138
+ "completions/min_terminated_length": 9.0,
139
+ "epoch": 0.0006429096252755327,
140
+ "grad_norm": 1.8772862682834104,
141
+ "kl": 0.0007915496826171875,
142
+ "learning_rate": 6e-07,
143
+ "loss": -0.3071,
144
+ "num_tokens": 176853.0,
145
+ "reward": 0.201990008354187,
146
+ "reward_std": 0.11269382387399673,
147
+ "rewards/format_reward_func": 0.6550000309944153,
148
+ "rewards/lcs_reward_func": 0.07099001854658127,
149
+ "step": 7
150
+ },
151
+ {
152
+ "clip_ratio/high_max": 0.0,
153
+ "clip_ratio/high_mean": 0.0,
154
+ "clip_ratio/low_mean": 0.0,
155
+ "clip_ratio/low_min": 0.0,
156
+ "clip_ratio/region_mean": 0.0,
157
+ "epoch": 0.0007347538574577516,
158
+ "grad_norm": 1.8735448714531584,
159
+ "kl": 0.0007781982421875,
160
+ "learning_rate": 7e-07,
161
+ "loss": -0.3071,
162
+ "step": 8
163
+ },
164
+ {
165
+ "clip_ratio/high_max": 0.0,
166
+ "clip_ratio/high_mean": 0.0,
167
+ "clip_ratio/low_mean": 0.0,
168
+ "clip_ratio/low_min": 0.0,
169
+ "clip_ratio/region_mean": 0.0,
170
+ "completions/clipped_ratio": 0.0,
171
+ "completions/max_length": 1736.0,
172
+ "completions/max_terminated_length": 1736.0,
173
+ "completions/mean_length": 886.15,
174
+ "completions/mean_terminated_length": 886.15,
175
+ "completions/min_length": 9.0,
176
+ "completions/min_terminated_length": 9.0,
177
+ "epoch": 0.0008265980896399706,
178
+ "grad_norm": 1.8594898973134009,
179
+ "kl": 0.000946044921875,
180
+ "learning_rate": 8e-07,
181
+ "loss": -0.4133,
182
+ "num_tokens": 226519.0,
183
+ "reward": 0.17254838347434998,
184
+ "reward_std": 0.08416791260242462,
185
+ "rewards/format_reward_func": 0.6950000524520874,
186
+ "rewards/lcs_reward_func": 0.03354838490486145,
187
+ "step": 9
188
+ },
189
+ {
190
+ "clip_ratio/high_max": 0.0,
191
+ "clip_ratio/high_mean": 0.0,
192
+ "clip_ratio/low_mean": 0.0,
193
+ "clip_ratio/low_min": 0.0,
194
+ "clip_ratio/region_mean": 0.0,
195
+ "epoch": 0.0009184423218221896,
196
+ "grad_norm": 1.8465168294754166,
197
+ "kl": 0.0010614395141601562,
198
+ "learning_rate": 9e-07,
199
+ "loss": -0.4133,
200
+ "step": 10
201
+ },
202
+ {
203
+ "epoch": 0.0009184423218221896,
204
+ "eval_completions/max_length": 2004,
205
+ "eval_completions/mean_length": 771.96875,
206
+ "eval_completions/min_length": 33,
207
+ "eval_reward": 0.21109572052955627,
208
+ "eval_reward_std": 0.11327587813138962,
209
+ "step": 10
210
+ },
211
+ {
212
+ "clip_ratio/high_max": 0.0,
213
+ "clip_ratio/high_mean": 0.0,
214
+ "clip_ratio/low_mean": 0.0,
215
+ "clip_ratio/low_min": 0.0,
216
+ "clip_ratio/region_mean": 0.0,
217
+ "completions/clipped_ratio": 0.0,
218
+ "completions/max_length": 2027.0,
219
+ "completions/max_terminated_length": 2027.0,
220
+ "completions/mean_length": 780.475,
221
+ "completions/mean_terminated_length": 780.475,
222
+ "completions/min_length": 9.0,
223
+ "completions/min_terminated_length": 9.0,
224
+ "epoch": 0.0010102865540044085,
225
+ "grad_norm": 2.231378663403048,
226
+ "kl": 0.0013132095336914062,
227
+ "learning_rate": 1e-06,
228
+ "loss": -0.4691,
229
+ "num_tokens": 270268.0,
230
+ "reward": 0.18357333540916443,
231
+ "reward_std": 0.09895642846822739,
232
+ "rewards/format_reward_func": 0.6800000071525574,
233
+ "rewards/lcs_reward_func": 0.04757332801818848,
234
+ "step": 11
235
+ },
236
+ {
237
+ "clip_ratio/high_max": 0.0,
238
+ "clip_ratio/high_mean": 0.0,
239
+ "clip_ratio/low_mean": 0.0,
240
+ "clip_ratio/low_min": 0.0,
241
+ "clip_ratio/region_mean": 0.0,
242
+ "epoch": 0.0011021307861866275,
243
+ "grad_norm": 2.203346987626314,
244
+ "kl": 0.00273895263671875,
245
+ "learning_rate": 1e-06,
246
+ "loss": -0.4691,
247
+ "step": 12
248
+ },
249
+ {
250
+ "clip_ratio/high_max": 0.0,
251
+ "clip_ratio/high_mean": 0.0,
252
+ "clip_ratio/low_mean": 0.0,
253
+ "clip_ratio/low_min": 0.0,
254
+ "clip_ratio/region_mean": 0.0,
255
+ "completions/clipped_ratio": 0.0,
256
+ "completions/max_length": 2046.0,
257
+ "completions/max_terminated_length": 2046.0,
258
+ "completions/mean_length": 886.75,
259
+ "completions/mean_terminated_length": 886.75,
260
+ "completions/min_length": 9.0,
261
+ "completions/min_terminated_length": 9.0,
262
+ "epoch": 0.0011939750183688465,
263
+ "grad_norm": 1.722693626980397,
264
+ "kl": 0.0030231475830078125,
265
+ "learning_rate": 1e-06,
266
+ "loss": -0.4224,
267
+ "num_tokens": 318068.0,
268
+ "reward": 0.20494212210178375,
269
+ "reward_std": 0.09967484325170517,
270
+ "rewards/format_reward_func": 0.7099999785423279,
271
+ "rewards/lcs_reward_func": 0.06294212490320206,
272
+ "step": 13
273
+ },
274
+ {
275
+ "clip_ratio/high_max": 0.0,
276
+ "clip_ratio/high_mean": 0.0,
277
+ "clip_ratio/low_mean": 0.0,
278
+ "clip_ratio/low_min": 0.0,
279
+ "clip_ratio/region_mean": 0.0,
280
+ "epoch": 0.0012858192505510655,
281
+ "grad_norm": 1.71286565180573,
282
+ "kl": 0.004039764404296875,
283
+ "learning_rate": 1e-06,
284
+ "loss": -0.4224,
285
+ "step": 14
286
+ },
287
+ {
288
+ "clip_ratio/high_max": 0.0,
289
+ "clip_ratio/high_mean": 0.0,
290
+ "clip_ratio/low_mean": 0.0,
291
+ "clip_ratio/low_min": 0.0,
292
+ "clip_ratio/region_mean": 0.0,
293
+ "completions/clipped_ratio": 0.0,
294
+ "completions/max_length": 1850.0,
295
+ "completions/max_terminated_length": 1850.0,
296
+ "completions/mean_length": 810.525,
297
+ "completions/mean_terminated_length": 810.525,
298
+ "completions/min_length": 9.0,
299
+ "completions/min_terminated_length": 9.0,
300
+ "epoch": 0.0013776634827332844,
301
+ "grad_norm": 1.5003857131255895,
302
+ "kl": 0.008074760437011719,
303
+ "learning_rate": 1e-06,
304
+ "loss": -0.1635,
305
+ "num_tokens": 363389.0,
306
+ "reward": 0.2573193907737732,
307
+ "reward_std": 0.10465292632579803,
308
+ "rewards/format_reward_func": 0.8100000619888306,
309
+ "rewards/lcs_reward_func": 0.09531939774751663,
310
+ "step": 15
311
+ },
312
+ {
313
+ "clip_ratio/high_max": 0.0,
314
+ "clip_ratio/high_mean": 0.0,
315
+ "clip_ratio/low_mean": 0.0,
316
+ "clip_ratio/low_min": 0.0,
317
+ "clip_ratio/region_mean": 0.0,
318
+ "epoch": 0.0014695077149155032,
319
+ "grad_norm": 1.4613380324648746,
320
+ "kl": 0.02646923065185547,
321
+ "learning_rate": 1e-06,
322
+ "loss": -0.1635,
323
+ "step": 16
324
+ },
325
+ {
326
+ "clip_ratio/high_max": 0.0,
327
+ "clip_ratio/high_mean": 0.0,
328
+ "clip_ratio/low_mean": 0.0,
329
+ "clip_ratio/low_min": 0.0,
330
+ "clip_ratio/region_mean": 0.0,
331
+ "completions/clipped_ratio": 0.0,
332
+ "completions/max_length": 1739.0,
333
+ "completions/max_terminated_length": 1739.0,
334
+ "completions/mean_length": 812.625,
335
+ "completions/mean_terminated_length": 812.625,
336
+ "completions/min_length": 9.0,
337
+ "completions/min_terminated_length": 9.0,
338
+ "epoch": 0.0015613519470977222,
339
+ "grad_norm": 1.8041011323824336,
340
+ "kl": 0.05150604248046875,
341
+ "learning_rate": 1e-06,
342
+ "loss": -0.1445,
343
+ "num_tokens": 407234.0,
344
+ "reward": 0.2477889060974121,
345
+ "reward_std": 0.09424225986003876,
346
+ "rewards/format_reward_func": 0.76500004529953,
347
+ "rewards/lcs_reward_func": 0.09478890895843506,
348
+ "step": 17
349
+ },
350
+ {
351
+ "clip_ratio/high_max": 0.0,
352
+ "clip_ratio/high_mean": 0.0,
353
+ "clip_ratio/low_mean": 0.0,
354
+ "clip_ratio/low_min": 0.0,
355
+ "clip_ratio/region_mean": 0.0,
356
+ "epoch": 0.0016531961792799412,
357
+ "grad_norm": 1.8014091360654865,
358
+ "kl": 0.09454345703125,
359
+ "learning_rate": 1e-06,
360
+ "loss": -0.1444,
361
+ "step": 18
362
+ },
363
+ {
364
+ "clip_ratio/high_max": 0.0,
365
+ "clip_ratio/high_mean": 0.0,
366
+ "clip_ratio/low_mean": 0.0,
367
+ "clip_ratio/low_min": 0.0,
368
+ "clip_ratio/region_mean": 0.0,
369
+ "completions/clipped_ratio": 0.0,
370
+ "completions/max_length": 1855.0,
371
+ "completions/max_terminated_length": 1855.0,
372
+ "completions/mean_length": 662.15,
373
+ "completions/mean_terminated_length": 662.15,
374
+ "completions/min_length": 9.0,
375
+ "completions/min_terminated_length": 9.0,
376
+ "epoch": 0.0017450404114621601,
377
+ "grad_norm": 2.720704647358892,
378
+ "kl": 0.529296875,
379
+ "learning_rate": 1e-06,
380
+ "loss": -0.6313,
381
+ "num_tokens": 446840.0,
382
+ "reward": 0.1577414870262146,
383
+ "reward_std": 0.07978807389736176,
384
+ "rewards/format_reward_func": 0.6700000166893005,
385
+ "rewards/lcs_reward_func": 0.02374148927628994,
386
+ "step": 19
387
+ },
388
+ {
389
+ "clip_ratio/high_max": 0.0,
390
+ "clip_ratio/high_mean": 0.0,
391
+ "clip_ratio/low_mean": 0.0,
392
+ "clip_ratio/low_min": 0.0,
393
+ "clip_ratio/region_mean": 0.0,
394
+ "epoch": 0.0018368846436443791,
395
+ "grad_norm": 2.7053569037359777,
396
+ "kl": 0.82177734375,
397
+ "learning_rate": 1e-06,
398
+ "loss": -0.6311,
399
+ "step": 20
400
+ },
401
+ {
402
+ "epoch": 0.0018368846436443791,
403
+ "eval_completions/max_length": 2065,
404
+ "eval_completions/mean_length": 889.28125,
405
+ "eval_completions/min_length": 33,
406
+ "eval_reward": 0.2044806331396103,
407
+ "eval_reward_std": 0.09299344569444656,
408
+ "step": 20
409
+ },
410
+ {
411
+ "clip_ratio/high_max": 0.0,
412
+ "clip_ratio/high_mean": 0.0,
413
+ "clip_ratio/low_mean": 0.0,
414
+ "clip_ratio/low_min": 0.0,
415
+ "clip_ratio/region_mean": 0.0,
416
+ "completions/clipped_ratio": 0.0,
417
+ "completions/max_length": 2021.0,
418
+ "completions/max_terminated_length": 2021.0,
419
+ "completions/mean_length": 655.925,
420
+ "completions/mean_terminated_length": 655.925,
421
+ "completions/min_length": 9.0,
422
+ "completions/min_terminated_length": 9.0,
423
+ "epoch": 0.001928728875826598,
424
+ "grad_norm": 3.0706531919007047,
425
+ "kl": 1.38525390625,
426
+ "learning_rate": 1e-06,
427
+ "loss": -0.856,
428
+ "num_tokens": 485717.0,
429
+ "reward": 0.1469292938709259,
430
+ "reward_std": 0.10327912867069244,
431
+ "rewards/format_reward_func": 0.5800000429153442,
432
+ "rewards/lcs_reward_func": 0.03092929720878601,
433
+ "step": 21
434
+ },
435
+ {
436
+ "clip_ratio/high_max": 0.0,
437
+ "clip_ratio/high_mean": 0.0,
438
+ "clip_ratio/low_mean": 0.0,
439
+ "clip_ratio/low_min": 0.0,
440
+ "clip_ratio/region_mean": 0.0,
441
+ "epoch": 0.002020573108008817,
442
+ "grad_norm": 3.0607964501913987,
443
+ "kl": 1.6064453125,
444
+ "learning_rate": 1e-06,
445
+ "loss": -0.8558,
446
+ "step": 22
447
+ },
448
+ {
449
+ "clip_ratio/high_max": 0.0,
450
+ "clip_ratio/high_mean": 0.0,
451
+ "clip_ratio/low_mean": 0.0,
452
+ "clip_ratio/low_min": 0.0,
453
+ "clip_ratio/region_mean": 0.0,
454
+ "completions/clipped_ratio": 0.0,
455
+ "completions/max_length": 1959.0,
456
+ "completions/max_terminated_length": 1959.0,
457
+ "completions/mean_length": 863.425,
458
+ "completions/mean_terminated_length": 863.425,
459
+ "completions/min_length": 5.0,
460
+ "completions/min_terminated_length": 5.0,
461
+ "epoch": 0.002112417340191036,
462
+ "grad_norm": 1.4283474171915316,
463
+ "kl": 0.6814689636230469,
464
+ "learning_rate": 1e-06,
465
+ "loss": -0.4158,
466
+ "num_tokens": 532504.0,
467
+ "reward": 0.1960056722164154,
468
+ "reward_std": 0.07993875443935394,
469
+ "rewards/format_reward_func": 0.7649999856948853,
470
+ "rewards/lcs_reward_func": 0.04300567880272865,
471
+ "step": 23
472
+ },
473
+ {
474
+ "clip_ratio/high_max": 0.0,
475
+ "clip_ratio/high_mean": 0.0,
476
+ "clip_ratio/low_mean": 0.0,
477
+ "clip_ratio/low_min": 0.0,
478
+ "clip_ratio/region_mean": 0.0,
479
+ "epoch": 0.002204261572373255,
480
+ "grad_norm": 1.4416929903577083,
481
+ "kl": 1.0042381286621094,
482
+ "learning_rate": 1e-06,
483
+ "loss": -0.4154,
484
+ "step": 24
485
+ },
486
+ {
487
+ "clip_ratio/high_max": 0.0,
488
+ "clip_ratio/high_mean": 0.0,
489
+ "clip_ratio/low_mean": 0.0,
490
+ "clip_ratio/low_min": 0.0,
491
+ "clip_ratio/region_mean": 0.0,
492
+ "completions/clipped_ratio": 0.0,
493
+ "completions/max_length": 1924.0,
494
+ "completions/max_terminated_length": 1924.0,
495
+ "completions/mean_length": 890.625,
496
+ "completions/mean_terminated_length": 890.625,
497
+ "completions/min_length": 9.0,
498
+ "completions/min_terminated_length": 9.0,
499
+ "epoch": 0.002296105804555474,
500
+ "grad_norm": 1.6827770351938804,
501
+ "kl": 0.92138671875,
502
+ "learning_rate": 1e-06,
503
+ "loss": -0.2923,
504
+ "num_tokens": 580859.0,
505
+ "reward": 0.21194560825824738,
506
+ "reward_std": 0.0956936851143837,
507
+ "rewards/format_reward_func": 0.7800000309944153,
508
+ "rewards/lcs_reward_func": 0.05594560503959656,
509
+ "step": 25
510
+ },
511
+ {
512
+ "clip_ratio/high_max": 0.0,
513
+ "clip_ratio/high_mean": 0.0,
514
+ "clip_ratio/low_mean": 0.0,
515
+ "clip_ratio/low_min": 0.0,
516
+ "clip_ratio/region_mean": 0.0,
517
+ "epoch": 0.002387950036737693,
518
+ "grad_norm": 1.7236721170439555,
519
+ "kl": 1.6044921875,
520
+ "learning_rate": 1e-06,
521
+ "loss": -0.2916,
522
+ "step": 26
523
+ },
524
+ {
525
+ "clip_ratio/high_max": 0.0,
526
+ "clip_ratio/high_mean": 0.0,
527
+ "clip_ratio/low_mean": 0.0,
528
+ "clip_ratio/low_min": 0.0,
529
+ "clip_ratio/region_mean": 0.0,
530
+ "completions/clipped_ratio": 0.0,
531
+ "completions/max_length": 2044.0,
532
+ "completions/max_terminated_length": 2044.0,
533
+ "completions/mean_length": 916.55,
534
+ "completions/mean_terminated_length": 916.55,
535
+ "completions/min_length": 9.0,
536
+ "completions/min_terminated_length": 9.0,
537
+ "epoch": 0.002479794268919912,
538
+ "grad_norm": 4.19058051585696,
539
+ "kl": 5.715314865112305,
540
+ "learning_rate": 1e-06,
541
+ "loss": -0.2778,
542
+ "num_tokens": 629931.0,
543
+ "reward": 0.20700404047966003,
544
+ "reward_std": 0.09488054364919662,
545
+ "rewards/format_reward_func": 0.7200000286102295,
546
+ "rewards/lcs_reward_func": 0.0630040392279625,
547
+ "step": 27
548
+ },
549
+ {
550
+ "clip_ratio/high_max": 0.0,
551
+ "clip_ratio/high_mean": 0.0,
552
+ "clip_ratio/low_mean": 0.0,
553
+ "clip_ratio/low_min": 0.0,
554
+ "clip_ratio/region_mean": 0.0,
555
+ "epoch": 0.002571638501102131,
556
+ "grad_norm": 4.1982741479661625,
557
+ "kl": 6.281728744506836,
558
+ "learning_rate": 1e-06,
559
+ "loss": -0.2772,
560
+ "step": 28
561
+ },
562
+ {
563
+ "clip_ratio/high_max": 0.0,
564
+ "clip_ratio/high_mean": 0.0,
565
+ "clip_ratio/low_mean": 0.0,
566
+ "clip_ratio/low_min": 0.0,
567
+ "clip_ratio/region_mean": 0.0,
568
+ "completions/clipped_ratio": 0.0,
569
+ "completions/max_length": 1817.0,
570
+ "completions/max_terminated_length": 1817.0,
571
+ "completions/mean_length": 837.875,
572
+ "completions/mean_terminated_length": 837.875,
573
+ "completions/min_length": 9.0,
574
+ "completions/min_terminated_length": 9.0,
575
+ "epoch": 0.00266348273328435,
576
+ "grad_norm": 7.077824440439472,
577
+ "kl": 12.953125,
578
+ "learning_rate": 1e-06,
579
+ "loss": -0.3563,
580
+ "num_tokens": 677336.0,
581
+ "reward": 0.21692918241024017,
582
+ "reward_std": 0.10151036083698273,
583
+ "rewards/format_reward_func": 0.8100000619888306,
584
+ "rewards/lcs_reward_func": 0.05492917448282242,
585
+ "step": 29
586
+ },
587
+ {
588
+ "clip_ratio/high_max": 0.0,
589
+ "clip_ratio/high_mean": 0.0,
590
+ "clip_ratio/low_mean": 0.0,
591
+ "clip_ratio/low_min": 0.0,
592
+ "clip_ratio/region_mean": 0.0,
593
+ "epoch": 0.002755326965466569,
594
+ "grad_norm": 7.8992652256415425,
595
+ "kl": 14.34765625,
596
+ "learning_rate": 1e-06,
597
+ "loss": -0.355,
598
+ "step": 30
599
+ },
600
+ {
601
+ "epoch": 0.002755326965466569,
602
+ "eval_completions/max_length": 1941,
603
+ "eval_completions/mean_length": 1021.4375,
604
+ "eval_completions/min_length": 33,
605
+ "eval_reward": 0.21428458392620087,
606
+ "eval_reward_std": 0.07939482480287552,
607
+ "step": 30
608
+ },
609
+ {
610
+ "clip_ratio/high_max": 0.0,
611
+ "clip_ratio/high_mean": 0.0,
612
+ "clip_ratio/low_mean": 0.0,
613
+ "clip_ratio/low_min": 0.0,
614
+ "clip_ratio/region_mean": 0.0,
615
+ "completions/clipped_ratio": 0.0,
616
+ "completions/max_length": 2037.0,
617
+ "completions/max_terminated_length": 2037.0,
618
+ "completions/mean_length": 1040.2,
619
+ "completions/mean_terminated_length": 1040.2,
620
+ "completions/min_length": 9.0,
621
+ "completions/min_terminated_length": 9.0,
622
+ "epoch": 0.002847171197648788,
623
+ "grad_norm": 1.6919689671443403,
624
+ "kl": 2.2836875915527344,
625
+ "learning_rate": 1e-06,
626
+ "loss": -0.2171,
627
+ "num_tokens": 732334.0,
628
+ "reward": 0.2230643928050995,
629
+ "reward_std": 0.06873544305562973,
630
+ "rewards/format_reward_func": 0.8800000548362732,
631
+ "rewards/lcs_reward_func": 0.04706438630819321,
632
+ "step": 31
633
+ },
634
+ {
635
+ "clip_ratio/high_max": 0.0,
636
+ "clip_ratio/high_mean": 0.0,
637
+ "clip_ratio/low_mean": 0.0,
638
+ "clip_ratio/low_min": 0.0,
639
+ "clip_ratio/region_mean": 0.0,
640
+ "epoch": 0.0029390154298310064,
641
+ "grad_norm": 1.4919001050426044,
642
+ "kl": 2.106945037841797,
643
+ "learning_rate": 1e-06,
644
+ "loss": -0.2173,
645
+ "step": 32
646
+ },
647
+ {
648
+ "clip_ratio/high_max": 0.0,
649
+ "clip_ratio/high_mean": 0.0,
650
+ "clip_ratio/low_mean": 0.0,
651
+ "clip_ratio/low_min": 0.0,
652
+ "clip_ratio/region_mean": 0.0,
653
+ "completions/clipped_ratio": 0.0,
654
+ "completions/max_length": 1880.0,
655
+ "completions/max_terminated_length": 1880.0,
656
+ "completions/mean_length": 908.5,
657
+ "completions/mean_terminated_length": 908.5,
658
+ "completions/min_length": 9.0,
659
+ "completions/min_terminated_length": 9.0,
660
+ "epoch": 0.0030308596620132254,
661
+ "grad_norm": 1.5930416401398144,
662
+ "kl": 3.513671875,
663
+ "learning_rate": 1e-06,
664
+ "loss": -0.3184,
665
+ "num_tokens": 780784.0,
666
+ "reward": 0.19843518733978271,
667
+ "reward_std": 0.09002426266670227,
668
+ "rewards/format_reward_func": 0.7999999523162842,
669
+ "rewards/lcs_reward_func": 0.0384351909160614,
670
+ "step": 33
671
+ },
672
+ {
673
+ "clip_ratio/high_max": 0.0,
674
+ "clip_ratio/high_mean": 0.0,
675
+ "clip_ratio/low_mean": 0.0,
676
+ "clip_ratio/low_min": 0.0,
677
+ "clip_ratio/region_mean": 0.0,
678
+ "epoch": 0.0031227038941954444,
679
+ "grad_norm": 1.5242817608935546,
680
+ "kl": 3.2734375,
681
+ "learning_rate": 1e-06,
682
+ "loss": -0.3186,
683
+ "step": 34
684
+ },
685
+ {
686
+ "clip_ratio/high_max": 0.0,
687
+ "clip_ratio/high_mean": 0.0,
688
+ "clip_ratio/low_mean": 0.0,
689
+ "clip_ratio/low_min": 0.0,
690
+ "clip_ratio/region_mean": 0.0,
691
+ "completions/clipped_ratio": 0.0,
692
+ "completions/max_length": 1991.0,
693
+ "completions/max_terminated_length": 1991.0,
694
+ "completions/mean_length": 865.9,
695
+ "completions/mean_terminated_length": 865.9,
696
+ "completions/min_length": 9.0,
697
+ "completions/min_terminated_length": 9.0,
698
+ "epoch": 0.0032145481263776633,
699
+ "grad_norm": 1.6408098250505057,
700
+ "kl": 4.494140625,
701
+ "learning_rate": 1e-06,
702
+ "loss": -0.4562,
703
+ "num_tokens": 831850.0,
704
+ "reward": 0.1844530701637268,
705
+ "reward_std": 0.08404044806957245,
706
+ "rewards/format_reward_func": 0.7550000548362732,
707
+ "rewards/lcs_reward_func": 0.0334530845284462,
708
+ "step": 35
709
+ },
710
+ {
711
+ "clip_ratio/high_max": 0.0,
712
+ "clip_ratio/high_mean": 0.0,
713
+ "clip_ratio/low_mean": 0.0,
714
+ "clip_ratio/low_min": 0.0,
715
+ "clip_ratio/region_mean": 0.0,
716
+ "epoch": 0.0033063923585598823,
717
+ "grad_norm": 1.4204649760811243,
718
+ "kl": 4.013671875,
719
+ "learning_rate": 1e-06,
720
+ "loss": -0.4566,
721
+ "step": 36
722
+ },
723
+ {
724
+ "clip_ratio/high_max": 0.0,
725
+ "clip_ratio/high_mean": 0.0,
726
+ "clip_ratio/low_mean": 0.0,
727
+ "clip_ratio/low_min": 0.0,
728
+ "clip_ratio/region_mean": 0.0,
729
+ "completions/clipped_ratio": 0.0,
730
+ "completions/max_length": 2036.0,
731
+ "completions/max_terminated_length": 2036.0,
732
+ "completions/mean_length": 985.825,
733
+ "completions/mean_terminated_length": 985.825,
734
+ "completions/min_length": 9.0,
735
+ "completions/min_terminated_length": 9.0,
736
+ "epoch": 0.0033982365907421013,
737
+ "grad_norm": 1.435337921695266,
738
+ "kl": 3.529296875,
739
+ "learning_rate": 1e-06,
740
+ "loss": -0.4318,
741
+ "num_tokens": 886073.0,
742
+ "reward": 0.19459547102451324,
743
+ "reward_std": 0.09948933124542236,
744
+ "rewards/format_reward_func": 0.7700000405311584,
745
+ "rewards/lcs_reward_func": 0.04059547185897827,
746
+ "step": 37
747
+ },
748
+ {
749
+ "clip_ratio/high_max": 0.0,
750
+ "clip_ratio/high_mean": 0.0,
751
+ "clip_ratio/low_mean": 0.0,
752
+ "clip_ratio/low_min": 0.0,
753
+ "clip_ratio/region_mean": 0.0,
754
+ "epoch": 0.0034900808229243203,
755
+ "grad_norm": 1.5637950407978138,
756
+ "kl": 3.4580078125,
757
+ "learning_rate": 1e-06,
758
+ "loss": -0.4319,
759
+ "step": 38
760
+ },
761
+ {
762
+ "clip_ratio/high_max": 0.0,
763
+ "clip_ratio/high_mean": 0.0,
764
+ "clip_ratio/low_mean": 0.0,
765
+ "clip_ratio/low_min": 0.0,
766
+ "clip_ratio/region_mean": 0.0,
767
+ "completions/clipped_ratio": 0.0,
768
+ "completions/max_length": 1984.0,
769
+ "completions/max_terminated_length": 1984.0,
770
+ "completions/mean_length": 870.6,
771
+ "completions/mean_terminated_length": 870.6,
772
+ "completions/min_length": 9.0,
773
+ "completions/min_terminated_length": 9.0,
774
+ "epoch": 0.0035819250551065392,
775
+ "grad_norm": 1.3550377236404076,
776
+ "kl": 2.017578125,
777
+ "learning_rate": 1e-06,
778
+ "loss": -0.2469,
779
+ "num_tokens": 933597.0,
780
+ "reward": 0.21190586686134338,
781
+ "reward_std": 0.07402609288692474,
782
+ "rewards/format_reward_func": 0.8350000381469727,
783
+ "rewards/lcs_reward_func": 0.04490587115287781,
784
+ "step": 39
785
+ },
786
+ {
787
+ "clip_ratio/high_max": 0.0,
788
+ "clip_ratio/high_mean": 0.0,
789
+ "clip_ratio/low_mean": 0.0,
790
+ "clip_ratio/low_min": 0.0,
791
+ "clip_ratio/region_mean": 0.0,
792
+ "epoch": 0.0036737692872887582,
793
+ "grad_norm": 1.3403329530602544,
794
+ "kl": 2.021484375,
795
+ "learning_rate": 1e-06,
796
+ "loss": -0.2469,
797
+ "step": 40
798
+ },
799
+ {
800
+ "epoch": 0.0036737692872887582,
801
+ "eval_completions/max_length": 2028,
802
+ "eval_completions/mean_length": 869.28125,
803
+ "eval_completions/min_length": 33,
804
+ "eval_reward": 0.20918260514736176,
805
+ "eval_reward_std": 0.08579342067241669,
806
+ "step": 40
807
+ },
808
+ {
809
+ "clip_ratio/high_max": 0.0,
810
+ "clip_ratio/high_mean": 0.0,
811
+ "clip_ratio/low_mean": 0.0,
812
+ "clip_ratio/low_min": 0.0,
813
+ "clip_ratio/region_mean": 0.0,
814
+ "completions/clipped_ratio": 0.0,
815
+ "completions/max_length": 2026.0,
816
+ "completions/max_terminated_length": 2026.0,
817
+ "completions/mean_length": 965.575,
818
+ "completions/mean_terminated_length": 965.575,
819
+ "completions/min_length": 9.0,
820
+ "completions/min_terminated_length": 9.0,
821
+ "epoch": 0.003765613519470977,
822
+ "grad_norm": 1.486948705724367,
823
+ "kl": 3.025390625,
824
+ "learning_rate": 1e-06,
825
+ "loss": -0.3514,
826
+ "num_tokens": 984750.0,
827
+ "reward": 0.20043790340423584,
828
+ "reward_std": 0.07684105634689331,
829
+ "rewards/format_reward_func": 0.800000011920929,
830
+ "rewards/lcs_reward_func": 0.04043790325522423,
831
+ "step": 41
832
+ },
833
+ {
834
+ "clip_ratio/high_max": 0.0,
835
+ "clip_ratio/high_mean": 0.0,
836
+ "clip_ratio/low_mean": 0.0,
837
+ "clip_ratio/low_min": 0.0,
838
+ "clip_ratio/region_mean": 0.0,
839
+ "epoch": 0.003857457751653196,
840
+ "grad_norm": 1.5564521188030682,
841
+ "kl": 3.048828125,
842
+ "learning_rate": 1e-06,
843
+ "loss": -0.3514,
844
+ "step": 42
845
+ },
846
+ {
847
+ "clip_ratio/high_max": 0.0,
848
+ "clip_ratio/high_mean": 0.0,
849
+ "clip_ratio/low_mean": 0.0,
850
+ "clip_ratio/low_min": 0.0,
851
+ "clip_ratio/region_mean": 0.0,
852
+ "completions/clipped_ratio": 0.0,
853
+ "completions/max_length": 2023.0,
854
+ "completions/max_terminated_length": 2023.0,
855
+ "completions/mean_length": 962.35,
856
+ "completions/mean_terminated_length": 962.35,
857
+ "completions/min_length": 9.0,
858
+ "completions/min_terminated_length": 9.0,
859
+ "epoch": 0.003949301983835415,
860
+ "grad_norm": 1.0825099833801732,
861
+ "kl": 1.3337478637695312,
862
+ "learning_rate": 1e-06,
863
+ "loss": -0.2821,
864
+ "num_tokens": 1035704.0,
865
+ "reward": 0.23006513714790344,
866
+ "reward_std": 0.08291836082935333,
867
+ "rewards/format_reward_func": 0.8200001120567322,
868
+ "rewards/lcs_reward_func": 0.06606514751911163,
869
+ "step": 43
870
+ },
871
+ {
872
+ "clip_ratio/high_max": 0.0,
873
+ "clip_ratio/high_mean": 0.0,
874
+ "clip_ratio/low_mean": 0.0,
875
+ "clip_ratio/low_min": 0.0,
876
+ "clip_ratio/region_mean": 0.0,
877
+ "epoch": 0.004041146216017634,
878
+ "grad_norm": 1.0843188590306336,
879
+ "kl": 1.2898292541503906,
880
+ "learning_rate": 1e-06,
881
+ "loss": -0.2821,
882
+ "step": 44
883
+ },
884
+ {
885
+ "clip_ratio/high_max": 0.0,
886
+ "clip_ratio/high_mean": 0.0,
887
+ "clip_ratio/low_mean": 0.0,
888
+ "clip_ratio/low_min": 0.0,
889
+ "clip_ratio/region_mean": 0.0,
890
+ "completions/clipped_ratio": 0.0,
891
+ "completions/max_length": 1924.0,
892
+ "completions/max_terminated_length": 1924.0,
893
+ "completions/mean_length": 900.25,
894
+ "completions/mean_terminated_length": 900.25,
895
+ "completions/min_length": 9.0,
896
+ "completions/min_terminated_length": 9.0,
897
+ "epoch": 0.004132990448199853,
898
+ "grad_norm": 1.5379484313730982,
899
+ "kl": 3.1171875,
900
+ "learning_rate": 1e-06,
901
+ "loss": -0.4024,
902
+ "num_tokens": 1084314.0,
903
+ "reward": 0.20005948841571808,
904
+ "reward_std": 0.09878475964069366,
905
+ "rewards/format_reward_func": 0.7900000810623169,
906
+ "rewards/lcs_reward_func": 0.04205949231982231,
907
+ "step": 45
908
+ },
909
+ {
910
+ "clip_ratio/high_max": 0.0,
911
+ "clip_ratio/high_mean": 0.0,
912
+ "clip_ratio/low_mean": 0.0,
913
+ "clip_ratio/low_min": 0.0,
914
+ "clip_ratio/region_mean": 0.0,
915
+ "epoch": 0.004224834680382072,
916
+ "grad_norm": 1.5124606590890506,
917
+ "kl": 3.0166015625,
918
+ "learning_rate": 1e-06,
919
+ "loss": -0.4024,
920
+ "step": 46
921
+ },
922
+ {
923
+ "clip_ratio/high_max": 0.0,
924
+ "clip_ratio/high_mean": 0.0,
925
+ "clip_ratio/low_mean": 0.0,
926
+ "clip_ratio/low_min": 0.0,
927
+ "clip_ratio/region_mean": 0.0,
928
+ "completions/clipped_ratio": 0.0,
929
+ "completions/max_length": 1850.0,
930
+ "completions/max_terminated_length": 1850.0,
931
+ "completions/mean_length": 938.775,
932
+ "completions/mean_terminated_length": 938.775,
933
+ "completions/min_length": 9.0,
934
+ "completions/min_terminated_length": 9.0,
935
+ "epoch": 0.004316678912564291,
936
+ "grad_norm": 1.408416490591962,
937
+ "kl": 1.4979820251464844,
938
+ "learning_rate": 1e-06,
939
+ "loss": -0.2602,
940
+ "num_tokens": 1135535.0,
941
+ "reward": 0.2001209557056427,
942
+ "reward_std": 0.06422517448663712,
943
+ "rewards/format_reward_func": 0.8700000047683716,
944
+ "rewards/lcs_reward_func": 0.026120955124497414,
945
+ "step": 47
946
+ },
947
+ {
948
+ "clip_ratio/high_max": 0.0,
949
+ "clip_ratio/high_mean": 0.0,
950
+ "clip_ratio/low_mean": 0.0,
951
+ "clip_ratio/low_min": 0.0,
952
+ "clip_ratio/region_mean": 0.0,
953
+ "epoch": 0.00440852314474651,
954
+ "grad_norm": 1.3929253504743904,
955
+ "kl": 1.4970436096191406,
956
+ "learning_rate": 1e-06,
957
+ "loss": -0.2602,
958
+ "step": 48
959
+ },
960
+ {
961
+ "clip_ratio/high_max": 0.0,
962
+ "clip_ratio/high_mean": 0.0,
963
+ "clip_ratio/low_mean": 0.0,
964
+ "clip_ratio/low_min": 0.0,
965
+ "clip_ratio/region_mean": 0.0,
966
+ "completions/clipped_ratio": 0.0,
967
+ "completions/max_length": 1917.0,
968
+ "completions/max_terminated_length": 1917.0,
969
+ "completions/mean_length": 900.025,
970
+ "completions/mean_terminated_length": 900.025,
971
+ "completions/min_length": 9.0,
972
+ "completions/min_terminated_length": 9.0,
973
+ "epoch": 0.004500367376928729,
974
+ "grad_norm": 1.2289751162438687,
975
+ "kl": 2.583984375,
976
+ "learning_rate": 1e-06,
977
+ "loss": -0.4411,
978
+ "num_tokens": 1183336.0,
979
+ "reward": 0.1928170770406723,
980
+ "reward_std": 0.08593375235795975,
981
+ "rewards/format_reward_func": 0.8100000619888306,
982
+ "rewards/lcs_reward_func": 0.030817076563835144,
983
+ "step": 49
984
+ },
985
+ {
986
+ "clip_ratio/high_max": 0.0,
987
+ "clip_ratio/high_mean": 0.0,
988
+ "clip_ratio/low_mean": 0.0,
989
+ "clip_ratio/low_min": 0.0,
990
+ "clip_ratio/region_mean": 0.0,
991
+ "epoch": 0.004592211609110948,
992
+ "grad_norm": 1.2085154906618722,
993
+ "kl": 2.6875,
994
+ "learning_rate": 1e-06,
995
+ "loss": -0.441,
996
+ "step": 50
997
+ },
998
+ {
999
+ "epoch": 0.004592211609110948,
1000
+ "eval_completions/max_length": 1942,
1001
+ "eval_completions/mean_length": 898.71875,
1002
+ "eval_completions/min_length": 33,
1003
+ "eval_reward": 0.20390485227108002,
1004
+ "eval_reward_std": 0.07529351860284805,
1005
+ "step": 50
1006
+ },
1007
+ {
1008
+ "clip_ratio/high_max": 0.0,
1009
+ "clip_ratio/high_mean": 0.0,
1010
+ "clip_ratio/low_mean": 0.0,
1011
+ "clip_ratio/low_min": 0.0,
1012
+ "clip_ratio/region_mean": 0.0,
1013
+ "completions/clipped_ratio": 0.0,
1014
+ "completions/max_length": 1970.0,
1015
+ "completions/max_terminated_length": 1970.0,
1016
+ "completions/mean_length": 828.45,
1017
+ "completions/mean_terminated_length": 828.45,
1018
+ "completions/min_length": 9.0,
1019
+ "completions/min_terminated_length": 9.0,
1020
+ "epoch": 0.0046840558412931665,
1021
+ "grad_norm": 1.690419927569193,
1022
+ "kl": 3.8125,
1023
+ "learning_rate": 1e-06,
1024
+ "loss": -0.6029,
1025
+ "num_tokens": 1228914.0,
1026
+ "reward": 0.17726577818393707,
1027
+ "reward_std": 0.10230447351932526,
1028
+ "rewards/format_reward_func": 0.7300000190734863,
1029
+ "rewards/lcs_reward_func": 0.03126578405499458,
1030
+ "step": 51
1031
+ },
1032
+ {
1033
+ "clip_ratio/high_max": 0.0,
1034
+ "clip_ratio/high_mean": 0.0,
1035
+ "clip_ratio/low_mean": 0.0,
1036
+ "clip_ratio/low_min": 0.0,
1037
+ "clip_ratio/region_mean": 0.0,
1038
+ "epoch": 0.004775900073475386,
1039
+ "grad_norm": 1.6807220195714059,
1040
+ "kl": 4.25,
1041
+ "learning_rate": 1e-06,
1042
+ "loss": -0.6025,
1043
+ "step": 52
1044
+ },
1045
+ {
1046
+ "clip_ratio/high_max": 0.0,
1047
+ "clip_ratio/high_mean": 0.0,
1048
+ "clip_ratio/low_mean": 0.0,
1049
+ "clip_ratio/low_min": 0.0,
1050
+ "clip_ratio/region_mean": 0.0,
1051
+ "completions/clipped_ratio": 0.0,
1052
+ "completions/max_length": 2032.0,
1053
+ "completions/max_terminated_length": 2032.0,
1054
+ "completions/mean_length": 1138.725,
1055
+ "completions/mean_terminated_length": 1138.725,
1056
+ "completions/min_length": 9.0,
1057
+ "completions/min_terminated_length": 9.0,
1058
+ "epoch": 0.0048677443056576045,
1059
+ "grad_norm": 0.9847388426456154,
1060
+ "kl": 2.0333099365234375,
1061
+ "learning_rate": 1e-06,
1062
+ "loss": -0.3001,
1063
+ "num_tokens": 1290023.0,
1064
+ "reward": 0.21287302672863007,
1065
+ "reward_std": 0.07877419143915176,
1066
+ "rewards/format_reward_func": 0.8600000739097595,
1067
+ "rewards/lcs_reward_func": 0.04087302088737488,
1068
+ "step": 53
1069
+ },
1070
+ {
1071
+ "clip_ratio/high_max": 0.0,
1072
+ "clip_ratio/high_mean": 0.0,
1073
+ "clip_ratio/low_mean": 0.0,
1074
+ "clip_ratio/low_min": 0.0,
1075
+ "clip_ratio/region_mean": 0.0,
1076
+ "epoch": 0.004959588537839824,
1077
+ "grad_norm": 0.9700683418665813,
1078
+ "kl": 2.1544952392578125,
1079
+ "learning_rate": 1e-06,
1080
+ "loss": -0.2999,
1081
+ "step": 54
1082
+ },
1083
+ {
1084
+ "clip_ratio/high_max": 0.0,
1085
+ "clip_ratio/high_mean": 0.0,
1086
+ "clip_ratio/low_mean": 0.0,
1087
+ "clip_ratio/low_min": 0.0,
1088
+ "clip_ratio/region_mean": 0.0,
1089
+ "completions/clipped_ratio": 0.0,
1090
+ "completions/max_length": 1954.0,
1091
+ "completions/max_terminated_length": 1954.0,
1092
+ "completions/mean_length": 851.125,
1093
+ "completions/mean_terminated_length": 851.125,
1094
+ "completions/min_length": 9.0,
1095
+ "completions/min_terminated_length": 9.0,
1096
+ "epoch": 0.0050514327700220425,
1097
+ "grad_norm": 1.7938254223400716,
1098
+ "kl": 4.5559234619140625,
1099
+ "learning_rate": 1e-06,
1100
+ "loss": -0.2103,
1101
+ "num_tokens": 1338988.0,
1102
+ "reward": 0.2193053662776947,
1103
+ "reward_std": 0.07375349849462509,
1104
+ "rewards/format_reward_func": 0.8300000429153442,
1105
+ "rewards/lcs_reward_func": 0.05330537632107735,
1106
+ "step": 55
1107
+ },
1108
+ {
1109
+ "clip_ratio/high_max": 0.0,
1110
+ "clip_ratio/high_mean": 0.0,
1111
+ "clip_ratio/low_mean": 0.0,
1112
+ "clip_ratio/low_min": 0.0,
1113
+ "clip_ratio/region_mean": 0.0,
1114
+ "epoch": 0.005143277002204262,
1115
+ "grad_norm": 1.831632948103029,
1116
+ "kl": 4.60284423828125,
1117
+ "learning_rate": 1e-06,
1118
+ "loss": -0.2102,
1119
+ "step": 56
1120
+ },
1121
+ {
1122
+ "clip_ratio/high_max": 0.0,
1123
+ "clip_ratio/high_mean": 0.0,
1124
+ "clip_ratio/low_mean": 0.0,
1125
+ "clip_ratio/low_min": 0.0,
1126
+ "clip_ratio/region_mean": 0.0,
1127
+ "completions/clipped_ratio": 0.0,
1128
+ "completions/max_length": 1778.0,
1129
+ "completions/max_terminated_length": 1778.0,
1130
+ "completions/mean_length": 1001.375,
1131
+ "completions/mean_terminated_length": 1001.375,
1132
+ "completions/min_length": 9.0,
1133
+ "completions/min_terminated_length": 9.0,
1134
+ "epoch": 0.00523512123438648,
1135
+ "grad_norm": 2.3877586127598405,
1136
+ "kl": 4.5402374267578125,
1137
+ "learning_rate": 1e-06,
1138
+ "loss": -0.2191,
1139
+ "num_tokens": 1395263.0,
1140
+ "reward": 0.22619211673736572,
1141
+ "reward_std": 0.07270254194736481,
1142
+ "rewards/format_reward_func": 0.8700000047683716,
1143
+ "rewards/lcs_reward_func": 0.052192118018865585,
1144
+ "step": 57
1145
+ },
1146
+ {
1147
+ "clip_ratio/high_max": 0.0,
1148
+ "clip_ratio/high_mean": 0.0,
1149
+ "clip_ratio/low_mean": 0.0,
1150
+ "clip_ratio/low_min": 0.0,
1151
+ "clip_ratio/region_mean": 0.0,
1152
+ "epoch": 0.0053269654665687,
1153
+ "grad_norm": 2.1516149349049303,
1154
+ "kl": 4.290260314941406,
1155
+ "learning_rate": 1e-06,
1156
+ "loss": -0.2194,
1157
+ "step": 58
1158
+ },
1159
+ {
1160
+ "clip_ratio/high_max": 0.0,
1161
+ "clip_ratio/high_mean": 0.0,
1162
+ "clip_ratio/low_mean": 0.0,
1163
+ "clip_ratio/low_min": 0.0,
1164
+ "clip_ratio/region_mean": 0.0,
1165
+ "completions/clipped_ratio": 0.0,
1166
+ "completions/max_length": 2000.0,
1167
+ "completions/max_terminated_length": 2000.0,
1168
+ "completions/mean_length": 1056.075,
1169
+ "completions/mean_terminated_length": 1056.075,
1170
+ "completions/min_length": 9.0,
1171
+ "completions/min_terminated_length": 9.0,
1172
+ "epoch": 0.005418809698750918,
1173
+ "grad_norm": 1.218185892788451,
1174
+ "kl": 1.4466400146484375,
1175
+ "learning_rate": 1e-06,
1176
+ "loss": -0.1457,
1177
+ "num_tokens": 1452716.0,
1178
+ "reward": 0.21461419761180878,
1179
+ "reward_std": 0.052187662571668625,
1180
+ "rewards/format_reward_func": 0.9099999666213989,
1181
+ "rewards/lcs_reward_func": 0.032614197582006454,
1182
+ "step": 59
1183
+ },
1184
+ {
1185
+ "clip_ratio/high_max": 0.0,
1186
+ "clip_ratio/high_mean": 0.0,
1187
+ "clip_ratio/low_mean": 0.0,
1188
+ "clip_ratio/low_min": 0.0,
1189
+ "clip_ratio/region_mean": 0.0,
1190
+ "epoch": 0.005510653930933138,
1191
+ "grad_norm": 1.2033731421993872,
1192
+ "kl": 1.3060531616210938,
1193
+ "learning_rate": 1e-06,
1194
+ "loss": -0.1458,
1195
+ "step": 60
1196
+ },
1197
+ {
1198
+ "epoch": 0.005510653930933138,
1199
+ "eval_completions/max_length": 1941,
1200
+ "eval_completions/mean_length": 1006.75,
1201
+ "eval_completions/min_length": 33,
1202
+ "eval_reward": 0.23812003433704376,
1203
+ "eval_reward_std": 0.06293292343616486,
1204
+ "step": 60
1205
+ },
1206
+ {
1207
+ "clip_ratio/high_max": 0.0,
1208
+ "clip_ratio/high_mean": 0.0,
1209
+ "clip_ratio/low_mean": 0.0,
1210
+ "clip_ratio/low_min": 0.0,
1211
+ "clip_ratio/region_mean": 0.0,
1212
+ "completions/clipped_ratio": 0.0,
1213
+ "completions/max_length": 2013.0,
1214
+ "completions/max_terminated_length": 2013.0,
1215
+ "completions/mean_length": 1043.0,
1216
+ "completions/mean_terminated_length": 1043.0,
1217
+ "completions/min_length": 9.0,
1218
+ "completions/min_terminated_length": 9.0,
1219
+ "epoch": 0.005602498163115356,
1220
+ "grad_norm": 1.3991475236429036,
1221
+ "kl": 2.1066741943359375,
1222
+ "learning_rate": 1e-06,
1223
+ "loss": -0.1661,
1224
+ "num_tokens": 1506436.0,
1225
+ "reward": 0.2325197160243988,
1226
+ "reward_std": 0.07145832479000092,
1227
+ "rewards/format_reward_func": 0.8600000739097595,
1228
+ "rewards/lcs_reward_func": 0.06051970645785332,
1229
+ "step": 61
1230
+ },
1231
+ {
1232
+ "clip_ratio/high_max": 0.0,
1233
+ "clip_ratio/high_mean": 0.0,
1234
+ "clip_ratio/low_mean": 0.0,
1235
+ "clip_ratio/low_min": 0.0,
1236
+ "clip_ratio/region_mean": 0.0,
1237
+ "epoch": 0.005694342395297576,
1238
+ "grad_norm": 1.4079001953689518,
1239
+ "kl": 2.1243209838867188,
1240
+ "learning_rate": 1e-06,
1241
+ "loss": -0.1661,
1242
+ "step": 62
1243
+ },
1244
+ {
1245
+ "clip_ratio/high_max": 0.0,
1246
+ "clip_ratio/high_mean": 0.0,
1247
+ "clip_ratio/low_mean": 0.0,
1248
+ "clip_ratio/low_min": 0.0,
1249
+ "clip_ratio/region_mean": 0.0,
1250
+ "completions/clipped_ratio": 0.0,
1251
+ "completions/max_length": 1760.0,
1252
+ "completions/max_terminated_length": 1760.0,
1253
+ "completions/mean_length": 898.8,
1254
+ "completions/mean_terminated_length": 898.8,
1255
+ "completions/min_length": 9.0,
1256
+ "completions/min_terminated_length": 9.0,
1257
+ "epoch": 0.005786186627479794,
1258
+ "grad_norm": 1.0822550075119286,
1259
+ "kl": 3.12109375,
1260
+ "learning_rate": 1e-06,
1261
+ "loss": -0.3438,
1262
+ "num_tokens": 1556288.0,
1263
+ "reward": 0.2405313104391098,
1264
+ "reward_std": 0.09354240447282791,
1265
+ "rewards/format_reward_func": 0.8300000429153442,
1266
+ "rewards/lcs_reward_func": 0.07453129440546036,
1267
+ "step": 63
1268
+ },
1269
+ {
1270
+ "clip_ratio/high_max": 0.0,
1271
+ "clip_ratio/high_mean": 0.0,
1272
+ "clip_ratio/low_mean": 0.0,
1273
+ "clip_ratio/low_min": 0.0,
1274
+ "clip_ratio/region_mean": 0.0,
1275
+ "epoch": 0.005878030859662013,
1276
+ "grad_norm": 1.0584717578885592,
1277
+ "kl": 2.984375,
1278
+ "learning_rate": 1e-06,
1279
+ "loss": -0.344,
1280
+ "step": 64
1281
+ },
1282
+ {
1283
+ "clip_ratio/high_max": 0.0,
1284
+ "clip_ratio/high_mean": 0.0,
1285
+ "clip_ratio/low_mean": 0.0,
1286
+ "clip_ratio/low_min": 0.0,
1287
+ "clip_ratio/region_mean": 0.0,
1288
+ "completions/clipped_ratio": 0.0,
1289
+ "completions/max_length": 1431.0,
1290
+ "completions/max_terminated_length": 1431.0,
1291
+ "completions/mean_length": 741.8,
1292
+ "completions/mean_terminated_length": 741.8,
1293
+ "completions/min_length": 9.0,
1294
+ "completions/min_terminated_length": 9.0,
1295
+ "epoch": 0.005969875091844232,
1296
+ "grad_norm": 1.2789482228795541,
1297
+ "kl": 2.197265625,
1298
+ "learning_rate": 1e-06,
1299
+ "loss": -0.2848,
1300
+ "num_tokens": 1599990.0,
1301
+ "reward": 0.21671777963638306,
1302
+ "reward_std": 0.07907099276781082,
1303
+ "rewards/format_reward_func": 0.8600000739097595,
1304
+ "rewards/lcs_reward_func": 0.04471778869628906,
1305
+ "step": 65
1306
+ },
1307
+ {
1308
+ "clip_ratio/high_max": 0.0,
1309
+ "clip_ratio/high_mean": 0.0,
1310
+ "clip_ratio/low_mean": 0.0,
1311
+ "clip_ratio/low_min": 0.0,
1312
+ "clip_ratio/region_mean": 0.0,
1313
+ "epoch": 0.006061719324026451,
1314
+ "grad_norm": 1.266764856520884,
1315
+ "kl": 2.224609375,
1316
+ "learning_rate": 1e-06,
1317
+ "loss": -0.2848,
1318
+ "step": 66
1319
+ },
1320
+ {
1321
+ "clip_ratio/high_max": 0.0,
1322
+ "clip_ratio/high_mean": 0.0,
1323
+ "clip_ratio/low_mean": 0.0,
1324
+ "clip_ratio/low_min": 0.0,
1325
+ "clip_ratio/region_mean": 0.0,
1326
+ "completions/clipped_ratio": 0.0,
1327
+ "completions/max_length": 1968.0,
1328
+ "completions/max_terminated_length": 1968.0,
1329
+ "completions/mean_length": 1052.7,
1330
+ "completions/mean_terminated_length": 1052.7,
1331
+ "completions/min_length": 9.0,
1332
+ "completions/min_terminated_length": 9.0,
1333
+ "epoch": 0.00615356355620867,
1334
+ "grad_norm": 1.2688600523508209,
1335
+ "kl": 1.39453125,
1336
+ "learning_rate": 1e-06,
1337
+ "loss": -0.2192,
1338
+ "num_tokens": 1654308.0,
1339
+ "reward": 0.23257216811180115,
1340
+ "reward_std": 0.06556694209575653,
1341
+ "rewards/format_reward_func": 0.9000000357627869,
1342
+ "rewards/lcs_reward_func": 0.05257216840982437,
1343
+ "step": 67
1344
+ },
1345
+ {
1346
+ "clip_ratio/high_max": 0.0,
1347
+ "clip_ratio/high_mean": 0.0,
1348
+ "clip_ratio/low_mean": 0.0,
1349
+ "clip_ratio/low_min": 0.0,
1350
+ "clip_ratio/region_mean": 0.0,
1351
+ "epoch": 0.006245407788390889,
1352
+ "grad_norm": 1.2695224583429259,
1353
+ "kl": 1.43560791015625,
1354
+ "learning_rate": 1e-06,
1355
+ "loss": -0.2192,
1356
+ "step": 68
1357
+ },
1358
+ {
1359
+ "clip_ratio/high_max": 0.0,
1360
+ "clip_ratio/high_mean": 0.0,
1361
+ "clip_ratio/low_mean": 0.0,
1362
+ "clip_ratio/low_min": 0.0,
1363
+ "clip_ratio/region_mean": 0.0,
1364
+ "completions/clipped_ratio": 0.0,
1365
+ "completions/max_length": 1798.0,
1366
+ "completions/max_terminated_length": 1798.0,
1367
+ "completions/mean_length": 1001.15,
1368
+ "completions/mean_terminated_length": 1001.15,
1369
+ "completions/min_length": 9.0,
1370
+ "completions/min_terminated_length": 9.0,
1371
+ "epoch": 0.006337252020573108,
1372
+ "grad_norm": 1.4279803413786,
1373
+ "kl": 0.4173736572265625,
1374
+ "learning_rate": 1e-06,
1375
+ "loss": 0.0345,
1376
+ "num_tokens": 1707374.0,
1377
+ "reward": 0.2260584533214569,
1378
+ "reward_std": 0.04003484547138214,
1379
+ "rewards/format_reward_func": 0.9399999976158142,
1380
+ "rewards/lcs_reward_func": 0.03805844858288765,
1381
+ "step": 69
1382
+ },
1383
+ {
1384
+ "clip_ratio/high_max": 0.0,
1385
+ "clip_ratio/high_mean": 0.0,
1386
+ "clip_ratio/low_mean": 0.0,
1387
+ "clip_ratio/low_min": 0.0,
1388
+ "clip_ratio/region_mean": 0.0,
1389
+ "epoch": 0.006429096252755327,
1390
+ "grad_norm": 1.4245546877421398,
1391
+ "kl": 0.42337799072265625,
1392
+ "learning_rate": 1e-06,
1393
+ "loss": 0.0345,
1394
+ "step": 70
1395
+ },
1396
+ {
1397
+ "epoch": 0.006429096252755327,
1398
+ "eval_completions/max_length": 2045,
1399
+ "eval_completions/mean_length": 940.09375,
1400
+ "eval_completions/min_length": 33,
1401
+ "eval_reward": 0.21806684136390686,
1402
+ "eval_reward_std": 0.07097575068473816,
1403
+ "step": 70
1404
+ },
1405
+ {
1406
+ "clip_ratio/high_max": 0.0,
1407
+ "clip_ratio/high_mean": 0.0,
1408
+ "clip_ratio/low_mean": 0.0,
1409
+ "clip_ratio/low_min": 0.0,
1410
+ "clip_ratio/region_mean": 0.0,
1411
+ "completions/clipped_ratio": 0.0,
1412
+ "completions/max_length": 1891.0,
1413
+ "completions/max_terminated_length": 1891.0,
1414
+ "completions/mean_length": 833.775,
1415
+ "completions/mean_terminated_length": 833.775,
1416
+ "completions/min_length": 9.0,
1417
+ "completions/min_terminated_length": 9.0,
1418
+ "epoch": 0.006520940484937546,
1419
+ "grad_norm": 1.532083612743953,
1420
+ "kl": 2.4554443359375,
1421
+ "learning_rate": 1e-06,
1422
+ "loss": -0.103,
1423
+ "num_tokens": 1752925.0,
1424
+ "reward": 0.2318977415561676,
1425
+ "reward_std": 0.07339000701904297,
1426
+ "rewards/format_reward_func": 0.8600000739097595,
1427
+ "rewards/lcs_reward_func": 0.05989774689078331,
1428
+ "step": 71
1429
+ },
1430
+ {
1431
+ "clip_ratio/high_max": 0.0,
1432
+ "clip_ratio/high_mean": 0.0,
1433
+ "clip_ratio/low_mean": 0.0,
1434
+ "clip_ratio/low_min": 0.0,
1435
+ "clip_ratio/region_mean": 0.0,
1436
+ "epoch": 0.006612784717119765,
1437
+ "grad_norm": 1.5458953560334885,
1438
+ "kl": 2.7545013427734375,
1439
+ "learning_rate": 1e-06,
1440
+ "loss": -0.1027,
1441
+ "step": 72
1442
+ },
1443
+ {
1444
+ "clip_ratio/high_max": 0.0,
1445
+ "clip_ratio/high_mean": 0.0,
1446
+ "clip_ratio/low_mean": 0.0,
1447
+ "clip_ratio/low_min": 0.0,
1448
+ "clip_ratio/region_mean": 0.0,
1449
+ "completions/clipped_ratio": 0.0,
1450
+ "completions/max_length": 2015.0,
1451
+ "completions/max_terminated_length": 2015.0,
1452
+ "completions/mean_length": 955.875,
1453
+ "completions/mean_terminated_length": 955.875,
1454
+ "completions/min_length": 9.0,
1455
+ "completions/min_terminated_length": 9.0,
1456
+ "epoch": 0.006704628949301984,
1457
+ "grad_norm": 1.5966261563593132,
1458
+ "kl": 2.0414581298828125,
1459
+ "learning_rate": 1e-06,
1460
+ "loss": -0.1909,
1461
+ "num_tokens": 1805010.0,
1462
+ "reward": 0.21281859278678894,
1463
+ "reward_std": 0.05903353542089462,
1464
+ "rewards/format_reward_func": 0.9000000357627869,
1465
+ "rewards/lcs_reward_func": 0.03281859681010246,
1466
+ "step": 73
1467
+ },
1468
+ {
1469
+ "clip_ratio/high_max": 0.0,
1470
+ "clip_ratio/high_mean": 0.0,
1471
+ "clip_ratio/low_mean": 0.0,
1472
+ "clip_ratio/low_min": 0.0,
1473
+ "clip_ratio/region_mean": 0.0,
1474
+ "epoch": 0.006796473181484203,
1475
+ "grad_norm": 1.5913807529250394,
1476
+ "kl": 2.0258331298828125,
1477
+ "learning_rate": 1e-06,
1478
+ "loss": -0.1909,
1479
+ "step": 74
1480
+ },
1481
+ {
1482
+ "clip_ratio/high_max": 0.0,
1483
+ "clip_ratio/high_mean": 0.0,
1484
+ "clip_ratio/low_mean": 0.0,
1485
+ "clip_ratio/low_min": 0.0,
1486
+ "clip_ratio/region_mean": 0.0,
1487
+ "completions/clipped_ratio": 0.0,
1488
+ "completions/max_length": 1949.0,
1489
+ "completions/max_terminated_length": 1949.0,
1490
+ "completions/mean_length": 847.3,
1491
+ "completions/mean_terminated_length": 847.3,
1492
+ "completions/min_length": 9.0,
1493
+ "completions/min_terminated_length": 9.0,
1494
+ "epoch": 0.006888317413666422,
1495
+ "grad_norm": 1.8091604327594668,
1496
+ "kl": 1.5551300048828125,
1497
+ "learning_rate": 1e-06,
1498
+ "loss": -0.1276,
1499
+ "num_tokens": 1850422.0,
1500
+ "reward": 0.22308267652988434,
1501
+ "reward_std": 0.05692993104457855,
1502
+ "rewards/format_reward_func": 0.9099999666213989,
1503
+ "rewards/lcs_reward_func": 0.041082676500082016,
1504
+ "step": 75
1505
+ },
1506
+ {
1507
+ "clip_ratio/high_max": 0.0,
1508
+ "clip_ratio/high_mean": 0.0,
1509
+ "clip_ratio/low_mean": 0.0,
1510
+ "clip_ratio/low_min": 0.0,
1511
+ "clip_ratio/region_mean": 0.0,
1512
+ "epoch": 0.0069801616458486405,
1513
+ "grad_norm": 1.793313998835583,
1514
+ "kl": 1.5044097900390625,
1515
+ "learning_rate": 1e-06,
1516
+ "loss": -0.1276,
1517
+ "step": 76
1518
+ },
1519
+ {
1520
+ "clip_ratio/high_max": 0.0,
1521
+ "clip_ratio/high_mean": 0.0,
1522
+ "clip_ratio/low_mean": 0.0,
1523
+ "clip_ratio/low_min": 0.0,
1524
+ "clip_ratio/region_mean": 0.0,
1525
+ "completions/clipped_ratio": 0.0,
1526
+ "completions/max_length": 1626.0,
1527
+ "completions/max_terminated_length": 1626.0,
1528
+ "completions/mean_length": 817.675,
1529
+ "completions/mean_terminated_length": 817.675,
1530
+ "completions/min_length": 9.0,
1531
+ "completions/min_terminated_length": 9.0,
1532
+ "epoch": 0.00707200587803086,
1533
+ "grad_norm": 1.4652517264367153,
1534
+ "kl": 1.5932769775390625,
1535
+ "learning_rate": 1e-06,
1536
+ "loss": -0.1444,
1537
+ "num_tokens": 1893259.0,
1538
+ "reward": 0.2210662066936493,
1539
+ "reward_std": 0.048665329813957214,
1540
+ "rewards/format_reward_func": 0.9100000262260437,
1541
+ "rewards/lcs_reward_func": 0.039066214114427567,
1542
+ "step": 77
1543
+ },
1544
+ {
1545
+ "clip_ratio/high_max": 0.0,
1546
+ "clip_ratio/high_mean": 0.0,
1547
+ "clip_ratio/low_mean": 0.0,
1548
+ "clip_ratio/low_min": 0.0,
1549
+ "clip_ratio/region_mean": 0.0,
1550
+ "epoch": 0.0071638501102130785,
1551
+ "grad_norm": 1.4504944985152288,
1552
+ "kl": 1.54644775390625,
1553
+ "learning_rate": 1e-06,
1554
+ "loss": -0.1444,
1555
+ "step": 78
1556
+ },
1557
+ {
1558
+ "clip_ratio/high_max": 0.0,
1559
+ "clip_ratio/high_mean": 0.0,
1560
+ "clip_ratio/low_mean": 0.0,
1561
+ "clip_ratio/low_min": 0.0,
1562
+ "clip_ratio/region_mean": 0.0,
1563
+ "completions/clipped_ratio": 0.0,
1564
+ "completions/max_length": 2016.0,
1565
+ "completions/max_terminated_length": 2016.0,
1566
+ "completions/mean_length": 1031.425,
1567
+ "completions/mean_terminated_length": 1031.425,
1568
+ "completions/min_length": 9.0,
1569
+ "completions/min_terminated_length": 9.0,
1570
+ "epoch": 0.007255694342395298,
1571
+ "grad_norm": 1.49379469044819,
1572
+ "kl": 0.9149017333984375,
1573
+ "learning_rate": 1e-06,
1574
+ "loss": -0.075,
1575
+ "num_tokens": 1949706.0,
1576
+ "reward": 0.21732431650161743,
1577
+ "reward_std": 0.04848937690258026,
1578
+ "rewards/format_reward_func": 0.9200000166893005,
1579
+ "rewards/lcs_reward_func": 0.03332432731986046,
1580
+ "step": 79
1581
+ },
1582
+ {
1583
+ "clip_ratio/high_max": 0.0,
1584
+ "clip_ratio/high_mean": 0.0,
1585
+ "clip_ratio/low_mean": 0.0,
1586
+ "clip_ratio/low_min": 0.0,
1587
+ "clip_ratio/region_mean": 0.0,
1588
+ "epoch": 0.0073475385745775165,
1589
+ "grad_norm": 1.4689419152987504,
1590
+ "kl": 0.8819580078125,
1591
+ "learning_rate": 1e-06,
1592
+ "loss": -0.075,
1593
+ "step": 80
1594
+ },
1595
+ {
1596
+ "epoch": 0.0073475385745775165,
1597
+ "eval_completions/max_length": 1773,
1598
+ "eval_completions/mean_length": 1051.78125,
1599
+ "eval_completions/min_length": 33,
1600
+ "eval_reward": 0.23151257634162903,
1601
+ "eval_reward_std": 0.045354004949331284,
1602
+ "step": 80
1603
+ },
1604
+ {
1605
+ "clip_ratio/high_max": 0.0,
1606
+ "clip_ratio/high_mean": 0.0,
1607
+ "clip_ratio/low_mean": 0.0,
1608
+ "clip_ratio/low_min": 0.0,
1609
+ "clip_ratio/region_mean": 0.0,
1610
+ "completions/clipped_ratio": 0.0,
1611
+ "completions/max_length": 1878.0,
1612
+ "completions/max_terminated_length": 1878.0,
1613
+ "completions/mean_length": 920.725,
1614
+ "completions/mean_terminated_length": 920.725,
1615
+ "completions/min_length": 9.0,
1616
+ "completions/min_terminated_length": 9.0,
1617
+ "epoch": 0.007439382806759736,
1618
+ "grad_norm": 1.6000042069486635,
1619
+ "kl": 0.6986236572265625,
1620
+ "learning_rate": 1e-06,
1621
+ "loss": 0.0355,
1622
+ "num_tokens": 1998625.0,
1623
+ "reward": 0.23952794075012207,
1624
+ "reward_std": 0.03578774258494377,
1625
+ "rewards/format_reward_func": 0.9699999690055847,
1626
+ "rewards/lcs_reward_func": 0.04552793875336647,
1627
+ "step": 81
1628
+ },
1629
+ {
1630
+ "clip_ratio/high_max": 0.0,
1631
+ "clip_ratio/high_mean": 0.0,
1632
+ "clip_ratio/low_mean": 0.0,
1633
+ "clip_ratio/low_min": 0.0,
1634
+ "clip_ratio/region_mean": 0.0,
1635
+ "epoch": 0.007531227038941954,
1636
+ "grad_norm": 1.6048928271218026,
1637
+ "kl": 0.7575225830078125,
1638
+ "learning_rate": 1e-06,
1639
+ "loss": 0.0355,
1640
+ "step": 82
1641
+ },
1642
+ {
1643
+ "clip_ratio/high_max": 0.0,
1644
+ "clip_ratio/high_mean": 0.0,
1645
+ "clip_ratio/low_mean": 0.0,
1646
+ "clip_ratio/low_min": 0.0,
1647
+ "clip_ratio/region_mean": 0.0,
1648
+ "completions/clipped_ratio": 0.0,
1649
+ "completions/max_length": 1929.0,
1650
+ "completions/max_terminated_length": 1929.0,
1651
+ "completions/mean_length": 1051.1,
1652
+ "completions/mean_terminated_length": 1051.1,
1653
+ "completions/min_length": 473.0,
1654
+ "completions/min_terminated_length": 473.0,
1655
+ "epoch": 0.007623071271124174,
1656
+ "grad_norm": 1.6622202652089961,
1657
+ "kl": 0.0103607177734375,
1658
+ "learning_rate": 1e-06,
1659
+ "loss": -0.0052,
1660
+ "num_tokens": 2054249.0,
1661
+ "reward": 0.2544408440589905,
1662
+ "reward_std": 0.02876886911690235,
1663
+ "rewards/format_reward_func": 0.9850000739097595,
1664
+ "rewards/lcs_reward_func": 0.057440828531980515,
1665
+ "step": 83
1666
+ },
1667
+ {
1668
+ "clip_ratio/high_max": 0.0,
1669
+ "clip_ratio/high_mean": 0.0,
1670
+ "clip_ratio/low_mean": 0.0,
1671
+ "clip_ratio/low_min": 0.0,
1672
+ "clip_ratio/region_mean": 0.0,
1673
+ "epoch": 0.007714915503306392,
1674
+ "grad_norm": 1.6711405480859438,
1675
+ "kl": 0.010833740234375,
1676
+ "learning_rate": 1e-06,
1677
+ "loss": -0.0052,
1678
+ "step": 84
1679
+ },
1680
+ {
1681
+ "clip_ratio/high_max": 0.0,
1682
+ "clip_ratio/high_mean": 0.0,
1683
+ "clip_ratio/low_mean": 0.0,
1684
+ "clip_ratio/low_min": 0.0,
1685
+ "clip_ratio/region_mean": 0.0,
1686
+ "completions/clipped_ratio": 0.0,
1687
+ "completions/max_length": 1665.0,
1688
+ "completions/max_terminated_length": 1665.0,
1689
+ "completions/mean_length": 863.25,
1690
+ "completions/mean_terminated_length": 863.25,
1691
+ "completions/min_length": 9.0,
1692
+ "completions/min_terminated_length": 9.0,
1693
+ "epoch": 0.007806759735488611,
1694
+ "grad_norm": 1.5596144146559612,
1695
+ "kl": 1.1883697509765625,
1696
+ "learning_rate": 1e-06,
1697
+ "loss": -0.0611,
1698
+ "num_tokens": 2101179.0,
1699
+ "reward": 0.2613794803619385,
1700
+ "reward_std": 0.06008349731564522,
1701
+ "rewards/format_reward_func": 0.9399999976158142,
1702
+ "rewards/lcs_reward_func": 0.07337947934865952,
1703
+ "step": 85
1704
+ },
1705
+ {
1706
+ "clip_ratio/high_max": 0.0,
1707
+ "clip_ratio/high_mean": 0.0,
1708
+ "clip_ratio/low_mean": 0.0,
1709
+ "clip_ratio/low_min": 0.0,
1710
+ "clip_ratio/region_mean": 0.0,
1711
+ "epoch": 0.00789860396767083,
1712
+ "grad_norm": 1.5643789434736712,
1713
+ "kl": 1.21990966796875,
1714
+ "learning_rate": 1e-06,
1715
+ "loss": -0.0611,
1716
+ "step": 86
1717
+ },
1718
+ {
1719
+ "clip_ratio/high_max": 0.0,
1720
+ "clip_ratio/high_mean": 0.0,
1721
+ "clip_ratio/low_mean": 0.0,
1722
+ "clip_ratio/low_min": 0.0,
1723
+ "clip_ratio/region_mean": 0.0,
1724
+ "completions/clipped_ratio": 0.0,
1725
+ "completions/max_length": 1878.0,
1726
+ "completions/max_terminated_length": 1878.0,
1727
+ "completions/mean_length": 874.4,
1728
+ "completions/mean_terminated_length": 874.4,
1729
+ "completions/min_length": 9.0,
1730
+ "completions/min_terminated_length": 9.0,
1731
+ "epoch": 0.00799044819985305,
1732
+ "grad_norm": 1.7164853856370337,
1733
+ "kl": 2.06787109375,
1734
+ "learning_rate": 1e-06,
1735
+ "loss": -0.1248,
1736
+ "num_tokens": 2150155.0,
1737
+ "reward": 0.2113932967185974,
1738
+ "reward_std": 0.0450090691447258,
1739
+ "rewards/format_reward_func": 0.8949999809265137,
1740
+ "rewards/lcs_reward_func": 0.03239328786730766,
1741
+ "step": 87
1742
+ },
1743
+ {
1744
+ "clip_ratio/high_max": 0.0,
1745
+ "clip_ratio/high_mean": 0.0,
1746
+ "clip_ratio/low_mean": 0.0,
1747
+ "clip_ratio/low_min": 0.0,
1748
+ "clip_ratio/region_mean": 0.0,
1749
+ "epoch": 0.008082292432035268,
1750
+ "grad_norm": 1.6836941181967466,
1751
+ "kl": 1.8844757080078125,
1752
+ "learning_rate": 1e-06,
1753
+ "loss": -0.125,
1754
+ "step": 88
1755
+ },
1756
+ {
1757
+ "clip_ratio/high_max": 0.0,
1758
+ "clip_ratio/high_mean": 0.0,
1759
+ "clip_ratio/low_mean": 0.0,
1760
+ "clip_ratio/low_min": 0.0,
1761
+ "clip_ratio/region_mean": 0.0,
1762
+ "completions/clipped_ratio": 0.0,
1763
+ "completions/max_length": 1801.0,
1764
+ "completions/max_terminated_length": 1801.0,
1765
+ "completions/mean_length": 856.1,
1766
+ "completions/mean_terminated_length": 856.1,
1767
+ "completions/min_length": 9.0,
1768
+ "completions/min_terminated_length": 9.0,
1769
+ "epoch": 0.008174136664217487,
1770
+ "grad_norm": 1.6060443906303505,
1771
+ "kl": 0.6384735107421875,
1772
+ "learning_rate": 1e-06,
1773
+ "loss": 0.0493,
1774
+ "num_tokens": 2198719.0,
1775
+ "reward": 0.23078730702400208,
1776
+ "reward_std": 0.030345093458890915,
1777
+ "rewards/format_reward_func": 0.9699999690055847,
1778
+ "rewards/lcs_reward_func": 0.036787305027246475,
1779
+ "step": 89
1780
+ },
1781
+ {
1782
+ "clip_ratio/high_max": 0.0,
1783
+ "clip_ratio/high_mean": 0.0,
1784
+ "clip_ratio/low_mean": 0.0,
1785
+ "clip_ratio/low_min": 0.0,
1786
+ "clip_ratio/region_mean": 0.0,
1787
+ "epoch": 0.008265980896399705,
1788
+ "grad_norm": 1.6107603772408245,
1789
+ "kl": 0.6232452392578125,
1790
+ "learning_rate": 1e-06,
1791
+ "loss": 0.0493,
1792
+ "step": 90
1793
+ },
1794
+ {
1795
+ "epoch": 0.008265980896399705,
1796
+ "eval_completions/max_length": 1363,
1797
+ "eval_completions/mean_length": 814.8125,
1798
+ "eval_completions/min_length": 33,
1799
+ "eval_reward": 0.24562440812587738,
1800
+ "eval_reward_std": 0.07713833451271057,
1801
+ "step": 90
1802
+ },
1803
+ {
1804
+ "clip_ratio/high_max": 0.0,
1805
+ "clip_ratio/high_mean": 0.0,
1806
+ "clip_ratio/low_mean": 0.0,
1807
+ "clip_ratio/low_min": 0.0,
1808
+ "clip_ratio/region_mean": 0.0,
1809
+ "completions/clipped_ratio": 0.0,
1810
+ "completions/max_length": 1551.0,
1811
+ "completions/max_terminated_length": 1551.0,
1812
+ "completions/mean_length": 899.9,
1813
+ "completions/mean_terminated_length": 899.9,
1814
+ "completions/min_length": 9.0,
1815
+ "completions/min_terminated_length": 9.0,
1816
+ "epoch": 0.008357825128581926,
1817
+ "grad_norm": 1.66604613571463,
1818
+ "kl": 0.5090789794921875,
1819
+ "learning_rate": 1e-06,
1820
+ "loss": 0.0378,
1821
+ "num_tokens": 2245695.0,
1822
+ "reward": 0.24726460874080658,
1823
+ "reward_std": 0.036231789737939835,
1824
+ "rewards/format_reward_func": 0.9699999690055847,
1825
+ "rewards/lcs_reward_func": 0.05326459929347038,
1826
+ "step": 91
1827
+ },
1828
+ {
1829
+ "clip_ratio/high_max": 0.0,
1830
+ "clip_ratio/high_mean": 0.0,
1831
+ "clip_ratio/low_mean": 0.0,
1832
+ "clip_ratio/low_min": 0.0,
1833
+ "clip_ratio/region_mean": 0.0,
1834
+ "epoch": 0.008449669360764144,
1835
+ "grad_norm": 1.6507974800991143,
1836
+ "kl": 0.517242431640625,
1837
+ "learning_rate": 1e-06,
1838
+ "loss": 0.0378,
1839
+ "step": 92
1840
+ },
1841
+ {
1842
+ "clip_ratio/high_max": 0.0,
1843
+ "clip_ratio/high_mean": 0.0,
1844
+ "clip_ratio/low_mean": 0.0,
1845
+ "clip_ratio/low_min": 0.0,
1846
+ "clip_ratio/region_mean": 0.0,
1847
+ "completions/clipped_ratio": 0.0,
1848
+ "completions/max_length": 1269.0,
1849
+ "completions/max_terminated_length": 1269.0,
1850
+ "completions/mean_length": 749.15,
1851
+ "completions/mean_terminated_length": 749.15,
1852
+ "completions/min_length": 9.0,
1853
+ "completions/min_terminated_length": 9.0,
1854
+ "epoch": 0.008541513592946363,
1855
+ "grad_norm": 1.9377993694564883,
1856
+ "kl": 0.8455352783203125,
1857
+ "learning_rate": 1e-06,
1858
+ "loss": -0.0226,
1859
+ "num_tokens": 2286321.0,
1860
+ "reward": 0.2655041515827179,
1861
+ "reward_std": 0.06015206128358841,
1862
+ "rewards/format_reward_func": 0.9699999690055847,
1863
+ "rewards/lcs_reward_func": 0.0715041384100914,
1864
+ "step": 93
1865
+ },
1866
+ {
1867
+ "clip_ratio/high_max": 0.0,
1868
+ "clip_ratio/high_mean": 0.0,
1869
+ "clip_ratio/low_mean": 0.0,
1870
+ "clip_ratio/low_min": 0.0,
1871
+ "clip_ratio/region_mean": 0.0,
1872
+ "epoch": 0.008633357825128581,
1873
+ "grad_norm": 1.9289812213800934,
1874
+ "kl": 0.7676544189453125,
1875
+ "learning_rate": 1e-06,
1876
+ "loss": -0.0227,
1877
+ "step": 94
1878
+ },
1879
+ {
1880
+ "clip_ratio/high_max": 0.0,
1881
+ "clip_ratio/high_mean": 0.0,
1882
+ "clip_ratio/low_mean": 0.0,
1883
+ "clip_ratio/low_min": 0.0,
1884
+ "clip_ratio/region_mean": 0.0,
1885
+ "completions/clipped_ratio": 0.0,
1886
+ "completions/max_length": 1792.0,
1887
+ "completions/max_terminated_length": 1792.0,
1888
+ "completions/mean_length": 887.275,
1889
+ "completions/mean_terminated_length": 887.275,
1890
+ "completions/min_length": 9.0,
1891
+ "completions/min_terminated_length": 9.0,
1892
+ "epoch": 0.008725202057310802,
1893
+ "grad_norm": 1.5084325888205004,
1894
+ "kl": 1.1592254638671875,
1895
+ "learning_rate": 1e-06,
1896
+ "loss": -0.034,
1897
+ "num_tokens": 2333032.0,
1898
+ "reward": 0.29208117723464966,
1899
+ "reward_std": 0.07356677949428558,
1900
+ "rewards/format_reward_func": 0.9449999928474426,
1901
+ "rewards/lcs_reward_func": 0.10308118164539337,
1902
+ "step": 95
1903
+ },
1904
+ {
1905
+ "clip_ratio/high_max": 0.0,
1906
+ "clip_ratio/high_mean": 0.0,
1907
+ "clip_ratio/low_mean": 0.0,
1908
+ "clip_ratio/low_min": 0.0,
1909
+ "clip_ratio/region_mean": 0.0,
1910
+ "epoch": 0.00881704628949302,
1911
+ "grad_norm": 1.5038288881177668,
1912
+ "kl": 1.1751251220703125,
1913
+ "learning_rate": 1e-06,
1914
+ "loss": -0.0339,
1915
+ "step": 96
1916
+ },
1917
+ {
1918
+ "clip_ratio/high_max": 0.0,
1919
+ "clip_ratio/high_mean": 0.0,
1920
+ "clip_ratio/low_mean": 0.0,
1921
+ "clip_ratio/low_min": 0.0,
1922
+ "clip_ratio/region_mean": 0.0,
1923
+ "completions/clipped_ratio": 0.0,
1924
+ "completions/max_length": 1859.0,
1925
+ "completions/max_terminated_length": 1859.0,
1926
+ "completions/mean_length": 905.975,
1927
+ "completions/mean_terminated_length": 905.975,
1928
+ "completions/min_length": 9.0,
1929
+ "completions/min_terminated_length": 9.0,
1930
+ "epoch": 0.008908890521675239,
1931
+ "grad_norm": 1.6650831756792426,
1932
+ "kl": 1.7579498291015625,
1933
+ "learning_rate": 1e-06,
1934
+ "loss": -0.1085,
1935
+ "num_tokens": 2385061.0,
1936
+ "reward": 0.24999132752418518,
1937
+ "reward_std": 0.0621689110994339,
1938
+ "rewards/format_reward_func": 0.9149999618530273,
1939
+ "rewards/lcs_reward_func": 0.06699130684137344,
1940
+ "step": 97
1941
+ },
1942
+ {
1943
+ "clip_ratio/high_max": 0.0,
1944
+ "clip_ratio/high_mean": 0.0,
1945
+ "clip_ratio/low_mean": 0.0,
1946
+ "clip_ratio/low_min": 0.0,
1947
+ "clip_ratio/region_mean": 0.0,
1948
+ "epoch": 0.009000734753857457,
1949
+ "grad_norm": 1.6701005438071024,
1950
+ "kl": 1.7736968994140625,
1951
+ "learning_rate": 1e-06,
1952
+ "loss": -0.1085,
1953
+ "step": 98
1954
+ },
1955
+ {
1956
+ "clip_ratio/high_max": 0.0,
1957
+ "clip_ratio/high_mean": 0.0,
1958
+ "clip_ratio/low_mean": 0.0,
1959
+ "clip_ratio/low_min": 0.0,
1960
+ "clip_ratio/region_mean": 0.0,
1961
+ "completions/clipped_ratio": 0.0,
1962
+ "completions/max_length": 1999.0,
1963
+ "completions/max_terminated_length": 1999.0,
1964
+ "completions/mean_length": 931.1,
1965
+ "completions/mean_terminated_length": 931.1,
1966
+ "completions/min_length": 9.0,
1967
+ "completions/min_terminated_length": 9.0,
1968
+ "epoch": 0.009092578986039677,
1969
+ "grad_norm": 1.1066387280360448,
1970
+ "kl": 1.6706085205078125,
1971
+ "learning_rate": 1e-06,
1972
+ "loss": -0.1735,
1973
+ "num_tokens": 2433405.0,
1974
+ "reward": 0.2186073362827301,
1975
+ "reward_std": 0.05433933064341545,
1976
+ "rewards/format_reward_func": 0.9300000071525574,
1977
+ "rewards/lcs_reward_func": 0.03260732442140579,
1978
+ "step": 99
1979
+ },
1980
+ {
1981
+ "clip_ratio/high_max": 0.0,
1982
+ "clip_ratio/high_mean": 0.0,
1983
+ "clip_ratio/low_mean": 0.0,
1984
+ "clip_ratio/low_min": 0.0,
1985
+ "clip_ratio/region_mean": 0.0,
1986
+ "epoch": 0.009184423218221896,
1987
+ "grad_norm": 1.1270276768467666,
1988
+ "kl": 1.70977783203125,
1989
+ "learning_rate": 1e-06,
1990
+ "loss": -0.1734,
1991
+ "step": 100
1992
+ },
1993
+ {
1994
+ "epoch": 0.009184423218221896,
1995
+ "eval_completions/max_length": 2003,
1996
+ "eval_completions/mean_length": 947.28125,
1997
+ "eval_completions/min_length": 476,
1998
+ "eval_reward": 0.24798785150051117,
1999
+ "eval_reward_std": 0.04357936605811119,
2000
+ "step": 100
2001
+ }
2002
+ ],
2003
+ "logging_steps": 1,
2004
+ "max_steps": 100,
2005
+ "num_input_tokens_seen": 2433405,
2006
+ "num_train_epochs": 1,
2007
+ "save_steps": 500,
2008
+ "stateful_callbacks": {
2009
+ "TrainerControl": {
2010
+ "args": {
2011
+ "should_epoch_stop": false,
2012
+ "should_evaluate": true,
2013
+ "should_log": false,
2014
+ "should_save": true,
2015
+ "should_training_stop": true
2016
+ },
2017
+ "attributes": {}
2018
+ }
2019
+ },
2020
+ "total_flos": 0.0,
2021
+ "train_batch_size": 10,
2022
+ "trial_name": null,
2023
+ "trial_params": null
2024
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc45b0645253f482631e53cd1594bba52bd3ebbc8216b76453d0133834e87dfb
3
+ size 7800
vocab.json ADDED
The diff for this file is too large to render. See raw diff