AltinAziziNovomind committed on
Commit
300f940
·
verified ·
1 Parent(s): 608d179

Upload 9 files

Browse files
adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "swiss-ai/Apertus-8B-Instruct-2509",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 64,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "down_proj",
34
+ "gate_proj",
35
+ "k_proj",
36
+ "q_proj",
37
+ "o_proj",
38
+ "up_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8ad81fb75479b68f77a0e5343c7a843689c089b6169568cc6113b0738cb0ee0
3
+ size 637586160
chat_template.jinja ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{#-
  render_typescript_type(param_spec, required_params, is_nullable=false)

  Renders one JSON-Schema-style parameter spec as a TypeScript-like type string.

  Mapping implemented below:
    * type "array": items of type string / number / integer / boolean become
      string[] / number[] / number[] / boolean[]. Any other item spec is rendered
      recursively and suffixed with []; it collapses to any[] when the inner type
      is "object | object" or longer than 50 characters. An array without items
      is any[]. " | null" is appended when the spec is nullable.
    * type given as a list of names: the names joined with " | ".
    * oneOf: "any" when at least one variant is an object and there is more than
      one variant; otherwise every variant is rendered and joined with " | ",
      followed by its description / default as a trailing comment.
    * type "string": a union of quoted enum literals when enum is set, otherwise
      string (plus " | null" when nullable).
    * type "number" / "integer": number.   type "boolean": boolean.
    * type "object": an inline record of its properties, a "?" marking every
      property that is not listed in the spec's own required list; plain object
      when no properties are declared.
    * anything else: any.

  required_params and is_nullable are accepted and forwarded on recursion, but
  the body never reads them.

  Layout note: the lines are deliberately left unindented. A few tags in this
  macro have no "-" trim marker next to them (the default comment inside the
  oneOf branch and the nested property type inside the object branch), so any
  whitespace placed in front of them would end up in the rendered prompt.
-#}
{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}
{%- if param_spec.type == "array" -%}
{%- if param_spec['items'] -%}
{%- if param_spec['items']['type'] == "string" -%}
{{- "string[]" }}
{%- elif param_spec['items']['type'] == "number" -%}
{{- "number[]" }}
{%- elif param_spec['items']['type'] == "integer" -%}
{{- "number[]" }}
{%- elif param_spec['items']['type'] == "boolean" -%}
{{- "boolean[]" }}
{%- else -%}
{%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%}
{%- if inner_type == "object | object" or inner_type|length > 50 -%}
{{- "any[]" }}
{%- else -%}
{{- inner_type + "[]" }}
{%- endif -%}
{%- endif -%}
{%- if param_spec.nullable -%}
{{- " | null" }}
{%- endif -%}
{%- else -%}
{{- "any[]" }}
{%- if param_spec.nullable -%}
{{- " | null" }}
{%- endif -%}
{%- endif -%}
{%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%}
{#- Handle array of types like ["object", "object"] from Union[dict, list] #}
{%- if param_spec.type | length > 1 -%}
{{- param_spec.type | join(" | ") }}
{%- else -%}
{{- param_spec.type[0] }}
{%- endif -%}
{%- elif param_spec.oneOf -%}
{#- Handle oneOf schemas - check for complex unions and fallback to any #}
{#- The flag lives on a namespace object: a plain set inside the for loop would
    be scoped to the loop body and the value would be lost once the loop ends,
    which made the "any" fallback below unreachable. -#}
{%- set oneof_state = namespace(has_object_variants=false) -%}
{%- for variant in param_spec.oneOf -%}
{%- if variant.type == "object" -%}
{%- set oneof_state.has_object_variants = true -%}
{%- endif -%}
{%- endfor -%}
{%- if oneof_state.has_object_variants and param_spec.oneOf|length > 1 -%}
{{- "any" }}
{%- else -%}
{%- for variant in param_spec.oneOf -%}
{{- render_typescript_type(variant, required_params) -}}
{%- if variant.description %}
{{- "// " + variant.description }}
{%- endif -%}
{%- if variant.default is defined %}
{{ "// default: " + variant.default|tojson }}
{%- endif -%}
{%- if not loop.last %}
{{- " | " }}
{% endif -%}
{%- endfor -%}
{%- endif -%}
{%- elif param_spec.type == "string" -%}
{%- if param_spec.enum -%}
{{- '"' + param_spec.enum|join('" | "') + '"' -}}
{%- else -%}
{{- "string" }}
{%- if param_spec.nullable %}
{{- " | null" }}
{%- endif -%}
{%- endif -%}
{%- elif param_spec.type == "number" -%}
{{- "number" }}
{%- elif param_spec.type == "integer" -%}
{{- "number" }}
{%- elif param_spec.type == "boolean" -%}
{{- "boolean" }}
{%- elif param_spec.type == "object" -%}
{%- if param_spec.properties -%}
{{- "{\n" }}
{%- for prop_name, prop_spec in param_spec.properties.items() -%}
{{- prop_name -}}
{%- if prop_name not in (param_spec.required or []) -%}
{{- "?" }}
{%- endif -%}
{{- ": " }}
{{ render_typescript_type(prop_spec, param_spec.required or []) }}
{%- if not loop.last -%}
{{-", " }}
{%- endif -%}
{%- endfor -%}
{{- "}" }}
{%- else -%}
{{- "object" }}
{%- endif -%}
{%- else -%}
{{- "any" }}
{%- endif -%}
{%- endmacro -%}
97
+
98
{#-
  render_tools(tools)

  Renders every tool as a TypeScript-like declaration:

      // <tool.description>
      type <tool.name> = (_: {
      // <param description>
      <param>?: <type>, // default: <value>,
      ...
      }) => any;

  A tool without parameter properties is rendered as "() => any;". Parameters
  that are missing from tool.parameters.required get a "?" suffix. Parameter
  types come from render_typescript_type. Declarations are separated by a
  single newline.

  Every tag in this macro carries a leading "-" trim marker, so the indentation
  used here never reaches the rendered output.
-#}
{%- macro render_tools(tools) -%}
    {%- for tool in tools %}
        {{- "// " + tool.description + "\n" }}
        {{- "type "+ tool.name + " = " }}
        {%- if tool.parameters and tool.parameters.properties %}
            {{- "(_: {\n" }}
            {%- for param_name, param_spec in tool.parameters.properties.items() %}
                {%- if param_spec.description %}
                    {{- "// " + param_spec.description + "\n" }}
                {%- endif %}
                {{- param_name }}
                {%- if param_name not in (tool.parameters.required or []) -%}
                    {{- "?" }}
                {%- endif -%}
                {{- ": " }}
                {{- render_typescript_type(param_spec, tool.parameters.required or []) }}
                {%- if param_spec.default is defined -%}
                    {#- Enum and oneOf defaults are printed verbatim when they are
                        strings. Any other default (integer, boolean, null, ...) is
                        JSON-encoded first, because concatenating it to a string
                        directly raises an error. -#}
                    {%- set raw_default = param_spec.default if param_spec.default is string else param_spec.default|tojson -%}
                    {%- if param_spec.enum %}
                        {{- ", // default: " + raw_default }}
                    {%- elif param_spec.oneOf %}
                        {{- "// default: " + raw_default }}
                    {%- else %}
                        {{- ", // default: " + param_spec.default|tojson }}
                    {%- endif -%}
                {%- endif -%}
                {%- if not loop.last %}
                    {{- ",\n" }}
                {%- else %}
                    {{- "\n" }}
                {%- endif -%}
            {%- endfor %}
            {{- "}) => any;" }}
        {%- else -%}
            {{- "() => any;" }}
        {%- endif -%}
        {%- if not loop.last -%}
            {{- "\n" }}
        {%- endif -%}
    {%- endfor %}
{%- endmacro -%}
138
+
139
{#-
  Top-level rendering script of the chat template.

  Output order:
    1. bos_token
    2. system block: the first message when its role is "system", otherwise a
       built-in default prompt that includes the current date
    3. developer block: "Deliberation: enabled|disabled" (from enable_thinking)
       followed by the rendered tool declarations or "Tool Capabilities: disabled"
    4. every remaining message, driven by the state kept in ns
    5. closing "]" / assistant end token as required, then the assistant start
       token when add_generation_prompt is set

  State kept in ns:
    in_assistant             an assistant turn is open (start token emitted,
                             end token not yet emitted)
    in_tool                  a "[" for role == "tool" messages is open
    in_inner                 the inner (thoughts) section is open
    waiting_for_tool_outputs tool calls were emitted and no outputs followed yet;
                             suppresses the final assistant end token
    assistant_format         "string" or "mapping" once the first assistant
                             content was seen; mixing both raises an exception

  Every tag that produces output sits between tags carrying a "-" trim marker,
  so the indentation below is never rendered.
-#}
{{ bos_token }}

{%- set system_token = '<|system_start|>' -%}
{%- set end_system_token = '<|system_end|>' -%}
{%- set developer_token = '<|developer_start|>' -%}
{%- set end_developer_token = '<|developer_end|>' -%}
{%- set user_token = '<|user_start|>' -%}
{%- set end_user_token = '<|user_end|>' -%}
{%- set assistant_token = '<|assistant_start|>' -%}
{%- set end_assistant_token = '<|assistant_end|>' -%}
{%- set inner_token = '<|inner_prefix|>' -%}
{%- set outer_token = '<|inner_suffix|>' -%}
{%- set tool_calls_token = '<|tools_prefix|>' -%}
{%- set end_tool_calls_token = '<|tools_suffix|>' -%}
{%- set image_token = '<|image|>' -%}

{%- set ns = namespace(in_assistant=false, in_tool=false, in_inner=false, waiting_for_tool_outputs=false, assistant_format=none) -%}

{#- System block: content may be a plain string or a mapping with a "text" key. -#}
{%- if messages and messages[0].role == 'system' -%}
    {%- if "content" in messages[0] -%}
        {%- if messages[0].content is string -%}
            {{ system_token + messages[0].content + end_system_token }}
        {%- elif messages[0].content is mapping and "text" in messages[0].content -%}
            {{ system_token + messages[0].content.text + end_system_token }}
        {%- else -%}
            {{- raise_exception("Invalid system message") -}}
        {%- endif -%}
    {%- else -%}
        {{- raise_exception("Invalid system message") -}}
    {%- endif -%}
    {%- set loop_messages = messages[1:] -%}
{%- else -%}
    {{ system_token + 'You are Apertus, a helpful assistant created by the SwissAI initiative.\nKnowledge cutoff: 2024-04\nCurrent date: ' + strftime_now('%Y-%m-%d') + end_system_token }}
    {%- set loop_messages = messages -%}
{%- endif -%}

{#- Developer block: deliberation switch and tool declarations. -#}
{{ developer_token + 'Deliberation: ' }}
{%- if enable_thinking is defined and enable_thinking -%}
    {{ 'enabled\n' }}
{%- else -%}
    {{ 'disabled\n' }}
{%- endif -%}
{%- if tools is defined and tools -%}
    {{ 'Tool Capabilities:\n' + render_tools(tools) }}
{%- else -%}
    {{ 'Tool Capabilities: disabled' }}
{%- endif -%}
{{ end_developer_token }}

{%- for message in loop_messages -%}
    {%- if message.role == 'user' -%}
        {#- A user turn closes whatever the previous assistant turn left open. -#}
        {%- set ns.in_inner = false -%}
        {#- Tool calls that never received an output are abandoned once the user
            speaks again. Without this reset the flag stays set for the rest of
            the conversation and the last assistant message is rendered without
            its end token. -#}
        {%- set ns.waiting_for_tool_outputs = false -%}
        {%- if ns.in_tool -%}
            {{ ']' }}
            {%- set ns.in_tool = false -%}
        {%- endif -%}
        {%- if ns.in_assistant -%}
            {{ end_assistant_token }}
            {%- set ns.in_assistant = false -%}
        {%- endif -%}
        {#- A user message without a "content" key renders nothing at all. -#}
        {%- if "content" in message -%}
            {{ user_token }}
            {%- if message.content is string -%}
                {{ message.content }}
            {%- elif message.content is mapping and "parts" in message.content -%}
                {%- set parts = message.content.parts -%}
                {%- for part in parts -%}
                    {%- if part.type == "text" -%}
                        {{ part.text }}
                    {%- elif part.type == "image" -%}
                        {{ image_token }}
                    {%- else -%}
                        {{- raise_exception("Invalid user part: " + part.type) -}}
                    {%- endif -%}
                {%- endfor -%}
            {%- else -%}
                {{- raise_exception("Invalid user message: " + message.role) -}}
            {%- endif -%}
            {{ end_user_token }}
        {%- endif -%}
    {%- elif message.role == 'assistant' -%}
        {#- Consecutive assistant / tool messages share one assistant turn. -#}
        {%- if not ns.in_assistant -%}
            {{ assistant_token }}
            {%- set ns.in_assistant = true -%}
        {%- endif -%}
        {%- if "content" in message -%}
            {%- if message.content is string and (ns.assistant_format is none or ns.assistant_format == "string") -%}
                {%- if ns.in_tool -%}
                    {{ ']' }}
                    {%- set ns.in_tool = false -%}
                {%- endif -%}
                {%- set ns.assistant_format = "string" -%}
                {{ message.content }}
            {%- elif message.content is mapping and "blocks" in message.content and (ns.assistant_format is none or ns.assistant_format == "mapping") -%}
                {%- set ns.assistant_format = "mapping" -%}
                {%- set blocks = message.content.blocks -%}
                {%- for block in blocks -%}
                    {%- if block.type == 'thoughts' -%}
                        {%- if ns.in_tool -%}
                            {{ ']' }}
                            {%- set ns.in_tool = false -%}
                        {%- endif -%}
                        {#- The inner prefix is emitted once per run of thoughts. -#}
                        {%- if not ns.in_inner -%}
                            {%- set ns.in_inner = true -%}
                            {{ inner_token }}
                        {%- endif -%}
                        {{ block.text }}
                    {%- elif block.type == 'tool_calls' -%}
                        {%- if ns.in_tool -%}
                            {{ ']' }}
                            {%- set ns.in_tool = false -%}
                        {%- endif -%}
                        {#- Only a lone display_answers call closes the inner section;
                            every other tool call stays inside it. -#}
                        {%- if ns.in_inner and not loop.first and block.calls|length == 1 and block.calls[0].name == 'display_answers' -%}
                            {%- set ns.in_inner = false -%}
                            {{ outer_token }}
                        {%- endif -%}
                        {{ tool_calls_token + '[' }}
                        {%- for tool_call in block.calls -%}
                            {#- arguments is concatenated as-is, so it has to be a string already. -#}
                            {{- '{"' + tool_call.name + '": ' + tool_call.arguments + '}' }}
                            {%- if not loop.last -%}
                                {{- ", " }}
                            {%- endif -%}
                        {%- endfor -%}
                        {{ ']' + end_tool_calls_token }}
                        {%- set ns.waiting_for_tool_outputs = true -%}
                    {%- elif block.type == 'tool_outputs' -%}
                        {%- if ns.in_tool -%}
                            {{- raise_exception("Cannot have both tool outputs as separate messages and tool outputs as blocks") -}}
                        {%- endif -%}
                        {{ '[' }}
                        {%- for tool_output in block.outputs -%}
                            {{- tool_output.output }}
                            {%- if not loop.last -%}
                                {{- ", " }}
                            {%- endif -%}
                        {%- endfor -%}
                        {{- ']' }}
                        {%- set ns.waiting_for_tool_outputs = false -%}
                    {%- elif block.type == 'response' -%}
                        {%- if ns.in_tool -%}
                            {{ ']' }}
                            {%- set ns.in_tool = false -%}
                        {%- endif -%}
                        {%- if (not loop.first and ns.in_inner) or (ns.in_assistant and ns.in_inner) -%}
                            {%- set ns.in_inner = false -%}
                            {{ outer_token }}
                        {%- endif -%}
                        {{ block.text }}
                    {%- else -%}
                        {{- raise_exception("Invalid assistant block type: " + block.type) -}}
                    {%- endif -%}
                {%- endfor -%}
            {%- else -%}
                {{- raise_exception("Invalid assistant content") -}}
            {%- endif -%}
        {%- else -%}
            {{- raise_exception("Invalid assistant message") -}}
        {%- endif -%}
        {#- Tool calls given on the message itself, next to its content. -#}
        {%- if "tool_calls" in message and message.tool_calls -%}
            {{ tool_calls_token + '[' }}
            {%- for tool_call in message.tool_calls -%}
                {%- if tool_call.type == 'function' -%}
                    {%- set function = tool_call.function -%}
                    {{- '{"' + function.name + '": ' + function.arguments + '}' }}
                    {%- if not loop.last -%}
                        {{- ", " }}
                    {%- endif -%}
                {%- else -%}
                    {{- raise_exception("Invalid tool call type: " + tool_call.type) -}}
                {%- endif -%}
            {%- endfor -%}
            {{ ']' + end_tool_calls_token }}
            {%- set ns.waiting_for_tool_outputs = true -%}
        {%- endif -%}
    {%- elif message.role == 'tool' -%}
        {#- Tool messages are rendered inside the open assistant turn as one
            bracketed, comma-separated list. -#}
        {%- if not ns.in_assistant -%}
            {{- raise_exception("Tool message outside of assistant") -}}
        {%- endif -%}
        {%- if not ns.in_tool -%}
            {{ '[' }}
            {%- set ns.in_tool = true -%}
        {%- else -%}
            {{ ", "}}
        {%- endif -%}
        {{ message.content }}
        {%- set ns.waiting_for_tool_outputs = false -%}
    {%- else -%}
        {{- raise_exception("Invalid message role") -}}
    {%- endif -%}
{%- endfor -%}
{#- Close a tool output list that is still open after the last message. -#}
{%- if ns.in_tool -%}
    {{ ']' }}
{%- endif -%}
{#- The assistant turn stays open when the caller wants to continue it or when
    emitted tool calls are still waiting for their outputs. -#}
{%- if ns.in_assistant and not (continue_assistant_message is defined and continue_assistant_message) and not ns.waiting_for_tool_outputs -%}
    {{ end_assistant_token }}
{%- endif -%}
{%- if add_generation_prompt -%}
    {{ assistant_token }}
{%- endif -%}
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b2f7af3fb1b32a7ce631316d56c6c67e91ffa9a94529ff8f5de96fb2967c022
3
+ size 1275392867
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3eba5392c47348ab7d81d8dffdd7d473ffb249a5c66f2f37d70d8035feb46408
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd7a3ff09c0928c1ed20cef557112ba0c3ee8103da6d654553c274b6cc450de8
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|assistant_end|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 981,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.6546974384784698,
14
+ "epoch": 0.3058103975535168,
15
+ "grad_norm": 0.37161776423454285,
16
+ "learning_rate": 9.931806517013612e-06,
17
+ "loss": 2.1612,
18
+ "mean_token_accuracy": 0.5265661662817002,
19
+ "num_tokens": 41805.0,
20
+ "step": 100
21
+ },
22
+ {
23
+ "entropy": 0.7053046183288098,
24
+ "epoch": 0.6116207951070336,
25
+ "grad_norm": 0.14000245928764343,
26
+ "learning_rate": 9.381208231670505e-06,
27
+ "loss": 0.6959,
28
+ "mean_token_accuracy": 0.8424646317958832,
29
+ "num_tokens": 84002.0,
30
+ "step": 200
31
+ },
32
+ {
33
+ "entropy": 0.27937801614403723,
34
+ "epoch": 0.9174311926605505,
35
+ "grad_norm": 0.1330832690000534,
36
+ "learning_rate": 8.336447938411616e-06,
37
+ "loss": 0.2956,
38
+ "mean_token_accuracy": 0.9407258796691894,
39
+ "num_tokens": 125931.0,
40
+ "step": 300
41
+ },
42
+ {
43
+ "entropy": 0.2766046752035618,
44
+ "epoch": 1.2232415902140672,
45
+ "grad_norm": 0.25764134526252747,
46
+ "learning_rate": 6.915365469768857e-06,
47
+ "loss": 0.2872,
48
+ "mean_token_accuracy": 0.9397406846284866,
49
+ "num_tokens": 168641.0,
50
+ "step": 400
51
+ },
52
+ {
53
+ "entropy": 0.24791674077510834,
54
+ "epoch": 1.529051987767584,
55
+ "grad_norm": 0.17321506142616272,
56
+ "learning_rate": 5.2782465115587796e-06,
57
+ "loss": 0.2481,
58
+ "mean_token_accuracy": 0.9486746054887771,
59
+ "num_tokens": 210299.0,
60
+ "step": 500
61
+ },
62
+ {
63
+ "entropy": 0.24874767586588858,
64
+ "epoch": 1.834862385321101,
65
+ "grad_norm": 0.19254006445407867,
66
+ "learning_rate": 3.609743778011684e-06,
67
+ "loss": 0.2462,
68
+ "mean_token_accuracy": 0.9466655850410461,
69
+ "num_tokens": 252718.0,
70
+ "step": 600
71
+ },
72
+ {
73
+ "entropy": 0.23701502427458762,
74
+ "epoch": 2.140672782874618,
75
+ "grad_norm": 0.1829081028699875,
76
+ "learning_rate": 2.098049799001508e-06,
77
+ "loss": 0.2318,
78
+ "mean_token_accuracy": 0.9517926776409149,
79
+ "num_tokens": 294521.0,
80
+ "step": 700
81
+ },
82
+ {
83
+ "entropy": 0.249193025380373,
84
+ "epoch": 2.4464831804281344,
85
+ "grad_norm": 0.19727888703346252,
86
+ "learning_rate": 9.136704470001101e-07,
87
+ "loss": 0.2398,
88
+ "mean_token_accuracy": 0.9476780182123185,
89
+ "num_tokens": 337195.0,
90
+ "step": 800
91
+ },
92
+ {
93
+ "entropy": 0.22941243454813956,
94
+ "epoch": 2.7522935779816513,
95
+ "grad_norm": 0.21921473741531372,
96
+ "learning_rate": 1.9019336445009918e-07,
97
+ "loss": 0.222,
98
+ "mean_token_accuracy": 0.9518460136651993,
99
+ "num_tokens": 378540.0,
100
+ "step": 900
101
+ }
102
+ ],
103
+ "logging_steps": 100,
104
+ "max_steps": 981,
105
+ "num_input_tokens_seen": 0,
106
+ "num_train_epochs": 3,
107
+ "save_steps": 100,
108
+ "stateful_callbacks": {
109
+ "TrainerControl": {
110
+ "args": {
111
+ "should_epoch_stop": false,
112
+ "should_evaluate": false,
113
+ "should_log": false,
114
+ "should_save": true,
115
+ "should_training_stop": true
116
+ },
117
+ "attributes": {}
118
+ }
119
+ },
120
+ "total_flos": 1.8985572221907456e+16,
121
+ "train_batch_size": 1,
122
+ "trial_name": null,
123
+ "trial_params": null
124
+ }