Zual commited on
Commit
619febf
·
verified ·
1 Parent(s): 28a7d73

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
chat_template.jinja ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- if enable_thinking is defined and enable_thinking is false %}
87
+ {{- '<think>\n\n</think>\n\n' }}
88
+ {%- endif %}
89
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "dtype": "bfloat16",
8
+ "eos_token_id": 151645,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_types": [
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention"
43
+ ],
44
+ "max_position_embeddings": 40960,
45
+ "max_window_layers": 28,
46
+ "model_type": "qwen3",
47
+ "num_attention_heads": 16,
48
+ "num_hidden_layers": 28,
49
+ "num_key_value_heads": 8,
50
+ "pad_token_id": 151645,
51
+ "rms_norm_eps": 1e-06,
52
+ "rope_scaling": null,
53
+ "rope_theta": 1000000,
54
+ "sliding_window": null,
55
+ "tie_word_embeddings": true,
56
+ "transformers_version": "4.57.3",
57
+ "use_cache": true,
58
+ "use_sliding_window": false,
59
+ "vocab_size": 151669
60
+ }
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151645,
8
+ "temperature": 0.6,
9
+ "top_k": 20,
10
+ "top_p": 0.95,
11
+ "transformers_version": "4.57.3"
12
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da0a12eb3ad86c86c0b09b30112c5072c1c5ceeb9631e6ba89e4599eef8dc346
3
+ size 1191588280
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8472912f4fa3feeff5ee907deae55c921053351e7a7f6b36d8882b8237703e1
3
+ size 3628043
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef9acad5dfe6355012be16c2854f2d692974d86e83413f9cfdf1067fc122d44c
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6a531435eae04e8235804d04df93951cd9b6b4b80afa9c7e68f2ff2d1734526
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": "<|im_end|>"
25
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|im_end|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
trainer_state.json ADDED
@@ -0,0 +1,1890 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1995,
3
+ "best_metric": 0.13476963341236115,
4
+ "best_model_checkpoint": "models/qwen3-0.6b-distilled/checkpoint-1995",
5
+ "epoch": 1.8805090737685601,
6
+ "eval_steps": 35,
7
+ "global_step": 1995,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0009427292010370022,
14
+ "grad_norm": 38.25,
15
+ "learning_rate": 0.0,
16
+ "loss": 1.0185,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.009427292010370021,
21
+ "grad_norm": 27.125,
22
+ "learning_rate": 1.8000000000000001e-06,
23
+ "loss": 0.8484,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.018854584020740042,
28
+ "grad_norm": 12.1875,
29
+ "learning_rate": 3.8000000000000005e-06,
30
+ "loss": 0.6648,
31
+ "step": 20
32
+ },
33
+ {
34
+ "epoch": 0.028281876031110063,
35
+ "grad_norm": 15.5,
36
+ "learning_rate": 5.8e-06,
37
+ "loss": 0.4851,
38
+ "step": 30
39
+ },
40
+ {
41
+ "epoch": 0.03299552203629508,
42
+ "eval_loss": 0.31027188897132874,
43
+ "eval_runtime": 3.3754,
44
+ "eval_samples_per_second": 12.739,
45
+ "eval_steps_per_second": 12.739,
46
+ "step": 35
47
+ },
48
+ {
49
+ "epoch": 0.037709168041480084,
50
+ "grad_norm": 10.75,
51
+ "learning_rate": 7.800000000000002e-06,
52
+ "loss": 0.4117,
53
+ "step": 40
54
+ },
55
+ {
56
+ "epoch": 0.047136460051850106,
57
+ "grad_norm": 9.4375,
58
+ "learning_rate": 9.800000000000001e-06,
59
+ "loss": 0.2625,
60
+ "step": 50
61
+ },
62
+ {
63
+ "epoch": 0.05656375206222013,
64
+ "grad_norm": 9.5625,
65
+ "learning_rate": 1.18e-05,
66
+ "loss": 0.2868,
67
+ "step": 60
68
+ },
69
+ {
70
+ "epoch": 0.06599104407259015,
71
+ "grad_norm": 10.625,
72
+ "learning_rate": 1.38e-05,
73
+ "loss": 0.2278,
74
+ "step": 70
75
+ },
76
+ {
77
+ "epoch": 0.06599104407259015,
78
+ "eval_loss": 0.22450809180736542,
79
+ "eval_runtime": 3.3497,
80
+ "eval_samples_per_second": 12.837,
81
+ "eval_steps_per_second": 12.837,
82
+ "step": 70
83
+ },
84
+ {
85
+ "epoch": 0.07541833608296017,
86
+ "grad_norm": 7.75,
87
+ "learning_rate": 1.58e-05,
88
+ "loss": 0.2753,
89
+ "step": 80
90
+ },
91
+ {
92
+ "epoch": 0.0848456280933302,
93
+ "grad_norm": 7.5625,
94
+ "learning_rate": 1.7800000000000002e-05,
95
+ "loss": 0.2747,
96
+ "step": 90
97
+ },
98
+ {
99
+ "epoch": 0.09427292010370021,
100
+ "grad_norm": 6.6875,
101
+ "learning_rate": 1.98e-05,
102
+ "loss": 0.2809,
103
+ "step": 100
104
+ },
105
+ {
106
+ "epoch": 0.09898656610888522,
107
+ "eval_loss": 0.21436083316802979,
108
+ "eval_runtime": 3.3464,
109
+ "eval_samples_per_second": 12.85,
110
+ "eval_steps_per_second": 12.85,
111
+ "step": 105
112
+ },
113
+ {
114
+ "epoch": 0.10370021211407024,
115
+ "grad_norm": 7.125,
116
+ "learning_rate": 1.991097922848665e-05,
117
+ "loss": 0.2175,
118
+ "step": 110
119
+ },
120
+ {
121
+ "epoch": 0.11312750412444025,
122
+ "grad_norm": 7.28125,
123
+ "learning_rate": 1.9812067260138478e-05,
124
+ "loss": 0.2321,
125
+ "step": 120
126
+ },
127
+ {
128
+ "epoch": 0.12255479613481028,
129
+ "grad_norm": 5.28125,
130
+ "learning_rate": 1.971315529179031e-05,
131
+ "loss": 0.1813,
132
+ "step": 130
133
+ },
134
+ {
135
+ "epoch": 0.1319820881451803,
136
+ "grad_norm": 5.59375,
137
+ "learning_rate": 1.9614243323442137e-05,
138
+ "loss": 0.1908,
139
+ "step": 140
140
+ },
141
+ {
142
+ "epoch": 0.1319820881451803,
143
+ "eval_loss": 0.19741062819957733,
144
+ "eval_runtime": 2.8143,
145
+ "eval_samples_per_second": 15.279,
146
+ "eval_steps_per_second": 15.279,
147
+ "step": 140
148
+ },
149
+ {
150
+ "epoch": 0.14140938015555032,
151
+ "grad_norm": 5.65625,
152
+ "learning_rate": 1.9515331355093966e-05,
153
+ "loss": 0.2125,
154
+ "step": 150
155
+ },
156
+ {
157
+ "epoch": 0.15083667216592034,
158
+ "grad_norm": 6.1875,
159
+ "learning_rate": 1.94164193867458e-05,
160
+ "loss": 0.2225,
161
+ "step": 160
162
+ },
163
+ {
164
+ "epoch": 0.16026396417629035,
165
+ "grad_norm": 6.375,
166
+ "learning_rate": 1.931750741839763e-05,
167
+ "loss": 0.2087,
168
+ "step": 170
169
+ },
170
+ {
171
+ "epoch": 0.16497761018147536,
172
+ "eval_loss": 0.2068619579076767,
173
+ "eval_runtime": 2.8432,
174
+ "eval_samples_per_second": 15.124,
175
+ "eval_steps_per_second": 15.124,
176
+ "step": 175
177
+ },
178
+ {
179
+ "epoch": 0.1696912561866604,
180
+ "grad_norm": 7.1875,
181
+ "learning_rate": 1.9218595450049458e-05,
182
+ "loss": 0.185,
183
+ "step": 180
184
+ },
185
+ {
186
+ "epoch": 0.1791185481970304,
187
+ "grad_norm": 6.78125,
188
+ "learning_rate": 1.9119683481701287e-05,
189
+ "loss": 0.1982,
190
+ "step": 190
191
+ },
192
+ {
193
+ "epoch": 0.18854584020740042,
194
+ "grad_norm": 5.96875,
195
+ "learning_rate": 1.9020771513353117e-05,
196
+ "loss": 0.2325,
197
+ "step": 200
198
+ },
199
+ {
200
+ "epoch": 0.19797313221777044,
201
+ "grad_norm": 9.0,
202
+ "learning_rate": 1.892185954500495e-05,
203
+ "loss": 0.1901,
204
+ "step": 210
205
+ },
206
+ {
207
+ "epoch": 0.19797313221777044,
208
+ "eval_loss": 0.18724432587623596,
209
+ "eval_runtime": 2.7949,
210
+ "eval_samples_per_second": 15.385,
211
+ "eval_steps_per_second": 15.385,
212
+ "step": 210
213
+ },
214
+ {
215
+ "epoch": 0.20740042422814048,
216
+ "grad_norm": 8.9375,
217
+ "learning_rate": 1.8822947576656775e-05,
218
+ "loss": 0.2276,
219
+ "step": 220
220
+ },
221
+ {
222
+ "epoch": 0.2168277162385105,
223
+ "grad_norm": 6.25,
224
+ "learning_rate": 1.8724035608308605e-05,
225
+ "loss": 0.1917,
226
+ "step": 230
227
+ },
228
+ {
229
+ "epoch": 0.2262550082488805,
230
+ "grad_norm": 6.34375,
231
+ "learning_rate": 1.8625123639960438e-05,
232
+ "loss": 0.1856,
233
+ "step": 240
234
+ },
235
+ {
236
+ "epoch": 0.23096865425406551,
237
+ "eval_loss": 0.17589253187179565,
238
+ "eval_runtime": 2.8142,
239
+ "eval_samples_per_second": 15.28,
240
+ "eval_steps_per_second": 15.28,
241
+ "step": 245
242
+ },
243
+ {
244
+ "epoch": 0.23568230025925052,
245
+ "grad_norm": 7.0625,
246
+ "learning_rate": 1.8526211671612267e-05,
247
+ "loss": 0.1666,
248
+ "step": 250
249
+ },
250
+ {
251
+ "epoch": 0.24510959226962056,
252
+ "grad_norm": 7.40625,
253
+ "learning_rate": 1.8427299703264096e-05,
254
+ "loss": 0.2121,
255
+ "step": 260
256
+ },
257
+ {
258
+ "epoch": 0.25453688427999055,
259
+ "grad_norm": 4.6875,
260
+ "learning_rate": 1.8328387734915926e-05,
261
+ "loss": 0.2028,
262
+ "step": 270
263
+ },
264
+ {
265
+ "epoch": 0.2639641762903606,
266
+ "grad_norm": 5.90625,
267
+ "learning_rate": 1.8229475766567755e-05,
268
+ "loss": 0.1484,
269
+ "step": 280
270
+ },
271
+ {
272
+ "epoch": 0.2639641762903606,
273
+ "eval_loss": 0.19002647697925568,
274
+ "eval_runtime": 2.8169,
275
+ "eval_samples_per_second": 15.265,
276
+ "eval_steps_per_second": 15.265,
277
+ "step": 280
278
+ },
279
+ {
280
+ "epoch": 0.27339146830073063,
281
+ "grad_norm": 4.71875,
282
+ "learning_rate": 1.8130563798219588e-05,
283
+ "loss": 0.1817,
284
+ "step": 290
285
+ },
286
+ {
287
+ "epoch": 0.28281876031110065,
288
+ "grad_norm": 6.78125,
289
+ "learning_rate": 1.8031651829871414e-05,
290
+ "loss": 0.2572,
291
+ "step": 300
292
+ },
293
+ {
294
+ "epoch": 0.29224605232147066,
295
+ "grad_norm": 6.1875,
296
+ "learning_rate": 1.7932739861523244e-05,
297
+ "loss": 0.1882,
298
+ "step": 310
299
+ },
300
+ {
301
+ "epoch": 0.29695969832665564,
302
+ "eval_loss": 0.16693313419818878,
303
+ "eval_runtime": 2.8909,
304
+ "eval_samples_per_second": 14.874,
305
+ "eval_steps_per_second": 14.874,
306
+ "step": 315
307
+ },
308
+ {
309
+ "epoch": 0.3016733443318407,
310
+ "grad_norm": 8.875,
311
+ "learning_rate": 1.7833827893175076e-05,
312
+ "loss": 0.2174,
313
+ "step": 320
314
+ },
315
+ {
316
+ "epoch": 0.3111006363422107,
317
+ "grad_norm": 5.96875,
318
+ "learning_rate": 1.7734915924826906e-05,
319
+ "loss": 0.1945,
320
+ "step": 330
321
+ },
322
+ {
323
+ "epoch": 0.3205279283525807,
324
+ "grad_norm": 8.8125,
325
+ "learning_rate": 1.7636003956478735e-05,
326
+ "loss": 0.197,
327
+ "step": 340
328
+ },
329
+ {
330
+ "epoch": 0.3299552203629507,
331
+ "grad_norm": 8.8125,
332
+ "learning_rate": 1.7537091988130565e-05,
333
+ "loss": 0.169,
334
+ "step": 350
335
+ },
336
+ {
337
+ "epoch": 0.3299552203629507,
338
+ "eval_loss": 0.16657419502735138,
339
+ "eval_runtime": 2.8397,
340
+ "eval_samples_per_second": 15.143,
341
+ "eval_steps_per_second": 15.143,
342
+ "step": 350
343
+ },
344
+ {
345
+ "epoch": 0.3393825123733208,
346
+ "grad_norm": 5.4375,
347
+ "learning_rate": 1.7438180019782397e-05,
348
+ "loss": 0.1996,
349
+ "step": 360
350
+ },
351
+ {
352
+ "epoch": 0.3488098043836908,
353
+ "grad_norm": 5.03125,
354
+ "learning_rate": 1.7339268051434227e-05,
355
+ "loss": 0.1717,
356
+ "step": 370
357
+ },
358
+ {
359
+ "epoch": 0.3582370963940608,
360
+ "grad_norm": 4.5,
361
+ "learning_rate": 1.7240356083086053e-05,
362
+ "loss": 0.1714,
363
+ "step": 380
364
+ },
365
+ {
366
+ "epoch": 0.3629507423992458,
367
+ "eval_loss": 0.16206760704517365,
368
+ "eval_runtime": 2.8698,
369
+ "eval_samples_per_second": 14.984,
370
+ "eval_steps_per_second": 14.984,
371
+ "step": 385
372
+ },
373
+ {
374
+ "epoch": 0.36766438840443083,
375
+ "grad_norm": 7.0,
376
+ "learning_rate": 1.7141444114737886e-05,
377
+ "loss": 0.1849,
378
+ "step": 390
379
+ },
380
+ {
381
+ "epoch": 0.37709168041480084,
382
+ "grad_norm": 10.75,
383
+ "learning_rate": 1.7042532146389715e-05,
384
+ "loss": 0.2017,
385
+ "step": 400
386
+ },
387
+ {
388
+ "epoch": 0.38651897242517086,
389
+ "grad_norm": 7.125,
390
+ "learning_rate": 1.6943620178041544e-05,
391
+ "loss": 0.1925,
392
+ "step": 410
393
+ },
394
+ {
395
+ "epoch": 0.3959462644355409,
396
+ "grad_norm": 5.46875,
397
+ "learning_rate": 1.6844708209693374e-05,
398
+ "loss": 0.1611,
399
+ "step": 420
400
+ },
401
+ {
402
+ "epoch": 0.3959462644355409,
403
+ "eval_loss": 0.1687425971031189,
404
+ "eval_runtime": 2.8778,
405
+ "eval_samples_per_second": 14.942,
406
+ "eval_steps_per_second": 14.942,
407
+ "step": 420
408
+ },
409
+ {
410
+ "epoch": 0.4053735564459109,
411
+ "grad_norm": 3.453125,
412
+ "learning_rate": 1.6745796241345203e-05,
413
+ "loss": 0.1983,
414
+ "step": 430
415
+ },
416
+ {
417
+ "epoch": 0.41480084845628096,
418
+ "grad_norm": 5.15625,
419
+ "learning_rate": 1.6646884272997036e-05,
420
+ "loss": 0.1961,
421
+ "step": 440
422
+ },
423
+ {
424
+ "epoch": 0.42422814046665097,
425
+ "grad_norm": 5.90625,
426
+ "learning_rate": 1.6547972304648865e-05,
427
+ "loss": 0.1952,
428
+ "step": 450
429
+ },
430
+ {
431
+ "epoch": 0.42894178647183595,
432
+ "eval_loss": 0.15973812341690063,
433
+ "eval_runtime": 2.8233,
434
+ "eval_samples_per_second": 15.23,
435
+ "eval_steps_per_second": 15.23,
436
+ "step": 455
437
+ },
438
+ {
439
+ "epoch": 0.433655432477021,
440
+ "grad_norm": 5.21875,
441
+ "learning_rate": 1.644906033630069e-05,
442
+ "loss": 0.1782,
443
+ "step": 460
444
+ },
445
+ {
446
+ "epoch": 0.443082724487391,
447
+ "grad_norm": 5.375,
448
+ "learning_rate": 1.6350148367952524e-05,
449
+ "loss": 0.1463,
450
+ "step": 470
451
+ },
452
+ {
453
+ "epoch": 0.452510016497761,
454
+ "grad_norm": 5.09375,
455
+ "learning_rate": 1.6251236399604354e-05,
456
+ "loss": 0.1774,
457
+ "step": 480
458
+ },
459
+ {
460
+ "epoch": 0.46193730850813103,
461
+ "grad_norm": 3.734375,
462
+ "learning_rate": 1.6152324431256183e-05,
463
+ "loss": 0.1956,
464
+ "step": 490
465
+ },
466
+ {
467
+ "epoch": 0.46193730850813103,
468
+ "eval_loss": 0.1619725376367569,
469
+ "eval_runtime": 2.8042,
470
+ "eval_samples_per_second": 15.334,
471
+ "eval_steps_per_second": 15.334,
472
+ "step": 490
473
+ },
474
+ {
475
+ "epoch": 0.47136460051850104,
476
+ "grad_norm": 6.75,
477
+ "learning_rate": 1.6053412462908013e-05,
478
+ "loss": 0.1327,
479
+ "step": 500
480
+ },
481
+ {
482
+ "epoch": 0.48079189252887106,
483
+ "grad_norm": 5.4375,
484
+ "learning_rate": 1.5954500494559842e-05,
485
+ "loss": 0.1798,
486
+ "step": 510
487
+ },
488
+ {
489
+ "epoch": 0.4902191845392411,
490
+ "grad_norm": 6.875,
491
+ "learning_rate": 1.5855588526211675e-05,
492
+ "loss": 0.18,
493
+ "step": 520
494
+ },
495
+ {
496
+ "epoch": 0.4949328305444261,
497
+ "eval_loss": 0.15706923604011536,
498
+ "eval_runtime": 2.8106,
499
+ "eval_samples_per_second": 15.299,
500
+ "eval_steps_per_second": 15.299,
501
+ "step": 525
502
+ },
503
+ {
504
+ "epoch": 0.49964647654961114,
505
+ "grad_norm": 5.25,
506
+ "learning_rate": 1.57566765578635e-05,
507
+ "loss": 0.1886,
508
+ "step": 530
509
+ },
510
+ {
511
+ "epoch": 0.5090737685599811,
512
+ "grad_norm": 4.46875,
513
+ "learning_rate": 1.565776458951533e-05,
514
+ "loss": 0.2052,
515
+ "step": 540
516
+ },
517
+ {
518
+ "epoch": 0.5185010605703512,
519
+ "grad_norm": 2.640625,
520
+ "learning_rate": 1.5558852621167163e-05,
521
+ "loss": 0.1506,
522
+ "step": 550
523
+ },
524
+ {
525
+ "epoch": 0.5279283525807212,
526
+ "grad_norm": 5.5625,
527
+ "learning_rate": 1.5459940652818992e-05,
528
+ "loss": 0.2149,
529
+ "step": 560
530
+ },
531
+ {
532
+ "epoch": 0.5279283525807212,
533
+ "eval_loss": 0.15407335758209229,
534
+ "eval_runtime": 2.9014,
535
+ "eval_samples_per_second": 14.821,
536
+ "eval_steps_per_second": 14.821,
537
+ "step": 560
538
+ },
539
+ {
540
+ "epoch": 0.5373556445910912,
541
+ "grad_norm": 4.28125,
542
+ "learning_rate": 1.5361028684470822e-05,
543
+ "loss": 0.1745,
544
+ "step": 570
545
+ },
546
+ {
547
+ "epoch": 0.5467829366014613,
548
+ "grad_norm": 4.625,
549
+ "learning_rate": 1.526211671612265e-05,
550
+ "loss": 0.1382,
551
+ "step": 580
552
+ },
553
+ {
554
+ "epoch": 0.5562102286118312,
555
+ "grad_norm": 7.5625,
556
+ "learning_rate": 1.5163204747774482e-05,
557
+ "loss": 0.1739,
558
+ "step": 590
559
+ },
560
+ {
561
+ "epoch": 0.5609238746170163,
562
+ "eval_loss": 0.15476632118225098,
563
+ "eval_runtime": 3.4281,
564
+ "eval_samples_per_second": 12.543,
565
+ "eval_steps_per_second": 12.543,
566
+ "step": 595
567
+ },
568
+ {
569
+ "epoch": 0.5656375206222013,
570
+ "grad_norm": 3.21875,
571
+ "learning_rate": 1.5064292779426313e-05,
572
+ "loss": 0.1492,
573
+ "step": 600
574
+ },
575
+ {
576
+ "epoch": 0.5750648126325713,
577
+ "grad_norm": 6.8125,
578
+ "learning_rate": 1.4965380811078141e-05,
579
+ "loss": 0.1487,
580
+ "step": 610
581
+ },
582
+ {
583
+ "epoch": 0.5844921046429413,
584
+ "grad_norm": 7.375,
585
+ "learning_rate": 1.486646884272997e-05,
586
+ "loss": 0.2154,
587
+ "step": 620
588
+ },
589
+ {
590
+ "epoch": 0.5939193966533113,
591
+ "grad_norm": 4.84375,
592
+ "learning_rate": 1.4767556874381802e-05,
593
+ "loss": 0.1679,
594
+ "step": 630
595
+ },
596
+ {
597
+ "epoch": 0.5939193966533113,
598
+ "eval_loss": 0.15013353526592255,
599
+ "eval_runtime": 2.7977,
600
+ "eval_samples_per_second": 15.37,
601
+ "eval_steps_per_second": 15.37,
602
+ "step": 630
603
+ },
604
+ {
605
+ "epoch": 0.6033466886636814,
606
+ "grad_norm": 6.96875,
607
+ "learning_rate": 1.4668644906033631e-05,
608
+ "loss": 0.182,
609
+ "step": 640
610
+ },
611
+ {
612
+ "epoch": 0.6127739806740514,
613
+ "grad_norm": 4.46875,
614
+ "learning_rate": 1.456973293768546e-05,
615
+ "loss": 0.1666,
616
+ "step": 650
617
+ },
618
+ {
619
+ "epoch": 0.6222012726844214,
620
+ "grad_norm": 4.65625,
621
+ "learning_rate": 1.4470820969337292e-05,
622
+ "loss": 0.1241,
623
+ "step": 660
624
+ },
625
+ {
626
+ "epoch": 0.6269149186896064,
627
+ "eval_loss": 0.1542421579360962,
628
+ "eval_runtime": 2.8955,
629
+ "eval_samples_per_second": 14.85,
630
+ "eval_steps_per_second": 14.85,
631
+ "step": 665
632
+ },
633
+ {
634
+ "epoch": 0.6316285646947915,
635
+ "grad_norm": 4.6875,
636
+ "learning_rate": 1.4371909000989121e-05,
637
+ "loss": 0.1584,
638
+ "step": 670
639
+ },
640
+ {
641
+ "epoch": 0.6410558567051614,
642
+ "grad_norm": 5.375,
643
+ "learning_rate": 1.4272997032640952e-05,
644
+ "loss": 0.1627,
645
+ "step": 680
646
+ },
647
+ {
648
+ "epoch": 0.6504831487155315,
649
+ "grad_norm": 4.78125,
650
+ "learning_rate": 1.417408506429278e-05,
651
+ "loss": 0.1207,
652
+ "step": 690
653
+ },
654
+ {
655
+ "epoch": 0.6599104407259014,
656
+ "grad_norm": 5.65625,
657
+ "learning_rate": 1.407517309594461e-05,
658
+ "loss": 0.2057,
659
+ "step": 700
660
+ },
661
+ {
662
+ "epoch": 0.6599104407259014,
663
+ "eval_loss": 0.15335653722286224,
664
+ "eval_runtime": 2.8317,
665
+ "eval_samples_per_second": 15.185,
666
+ "eval_steps_per_second": 15.185,
667
+ "step": 700
668
+ },
669
+ {
670
+ "epoch": 0.6693377327362715,
671
+ "grad_norm": 3.578125,
672
+ "learning_rate": 1.397626112759644e-05,
673
+ "loss": 0.1882,
674
+ "step": 710
675
+ },
676
+ {
677
+ "epoch": 0.6787650247466416,
678
+ "grad_norm": 9.5,
679
+ "learning_rate": 1.387734915924827e-05,
680
+ "loss": 0.1758,
681
+ "step": 720
682
+ },
683
+ {
684
+ "epoch": 0.6881923167570115,
685
+ "grad_norm": 5.9375,
686
+ "learning_rate": 1.3778437190900101e-05,
687
+ "loss": 0.1939,
688
+ "step": 730
689
+ },
690
+ {
691
+ "epoch": 0.6929059627621965,
692
+ "eval_loss": 0.15230855345726013,
693
+ "eval_runtime": 3.345,
694
+ "eval_samples_per_second": 12.855,
695
+ "eval_steps_per_second": 12.855,
696
+ "step": 735
697
+ },
698
+ {
699
+ "epoch": 0.6976196087673816,
700
+ "grad_norm": 4.71875,
701
+ "learning_rate": 1.367952522255193e-05,
702
+ "loss": 0.1548,
703
+ "step": 740
704
+ },
705
+ {
706
+ "epoch": 0.7070469007777516,
707
+ "grad_norm": 3.90625,
708
+ "learning_rate": 1.358061325420376e-05,
709
+ "loss": 0.1724,
710
+ "step": 750
711
+ },
712
+ {
713
+ "epoch": 0.7164741927881216,
714
+ "grad_norm": 5.125,
715
+ "learning_rate": 1.3481701285855591e-05,
716
+ "loss": 0.2,
717
+ "step": 760
718
+ },
719
+ {
720
+ "epoch": 0.7259014847984916,
721
+ "grad_norm": 3.296875,
722
+ "learning_rate": 1.3382789317507419e-05,
723
+ "loss": 0.1707,
724
+ "step": 770
725
+ },
726
+ {
727
+ "epoch": 0.7259014847984916,
728
+ "eval_loss": 0.14677099883556366,
729
+ "eval_runtime": 2.9348,
730
+ "eval_samples_per_second": 14.652,
731
+ "eval_steps_per_second": 14.652,
732
+ "step": 770
733
+ },
734
+ {
735
+ "epoch": 0.7353287768088617,
736
+ "grad_norm": 4.78125,
737
+ "learning_rate": 1.3283877349159248e-05,
738
+ "loss": 0.1631,
739
+ "step": 780
740
+ },
741
+ {
742
+ "epoch": 0.7447560688192316,
743
+ "grad_norm": 7.6875,
744
+ "learning_rate": 1.3184965380811079e-05,
745
+ "loss": 0.1723,
746
+ "step": 790
747
+ },
748
+ {
749
+ "epoch": 0.7541833608296017,
750
+ "grad_norm": 8.25,
751
+ "learning_rate": 1.3086053412462909e-05,
752
+ "loss": 0.1526,
753
+ "step": 800
754
+ },
755
+ {
756
+ "epoch": 0.7588970068347867,
757
+ "eval_loss": 0.14969274401664734,
758
+ "eval_runtime": 2.8149,
759
+ "eval_samples_per_second": 15.276,
760
+ "eval_steps_per_second": 15.276,
761
+ "step": 805
762
+ },
763
+ {
764
+ "epoch": 0.7636106528399718,
765
+ "grad_norm": 5.71875,
766
+ "learning_rate": 1.298714144411474e-05,
767
+ "loss": 0.1505,
768
+ "step": 810
769
+ },
770
+ {
771
+ "epoch": 0.7730379448503417,
772
+ "grad_norm": 3.984375,
773
+ "learning_rate": 1.2888229475766569e-05,
774
+ "loss": 0.1657,
775
+ "step": 820
776
+ },
777
+ {
778
+ "epoch": 0.7824652368607118,
779
+ "grad_norm": 5.5625,
780
+ "learning_rate": 1.27893175074184e-05,
781
+ "loss": 0.1558,
782
+ "step": 830
783
+ },
784
+ {
785
+ "epoch": 0.7918925288710817,
786
+ "grad_norm": 5.875,
787
+ "learning_rate": 1.2690405539070228e-05,
788
+ "loss": 0.1449,
789
+ "step": 840
790
+ },
791
+ {
792
+ "epoch": 0.7918925288710817,
793
+ "eval_loss": 0.147576704621315,
794
+ "eval_runtime": 2.8182,
795
+ "eval_samples_per_second": 15.258,
796
+ "eval_steps_per_second": 15.258,
797
+ "step": 840
798
+ },
799
+ {
800
+ "epoch": 0.8013198208814518,
801
+ "grad_norm": 7.84375,
802
+ "learning_rate": 1.2591493570722057e-05,
803
+ "loss": 0.178,
804
+ "step": 850
805
+ },
806
+ {
807
+ "epoch": 0.8107471128918218,
808
+ "grad_norm": 6.28125,
809
+ "learning_rate": 1.2492581602373888e-05,
810
+ "loss": 0.172,
811
+ "step": 860
812
+ },
813
+ {
814
+ "epoch": 0.8201744049021918,
815
+ "grad_norm": 4.625,
816
+ "learning_rate": 1.2393669634025718e-05,
817
+ "loss": 0.1812,
818
+ "step": 870
819
+ },
820
+ {
821
+ "epoch": 0.8248880509073768,
822
+ "eval_loss": 0.14615514874458313,
823
+ "eval_runtime": 2.8438,
824
+ "eval_samples_per_second": 15.12,
825
+ "eval_steps_per_second": 15.12,
826
+ "step": 875
827
+ },
828
+ {
829
+ "epoch": 0.8296016969125619,
830
+ "grad_norm": 4.53125,
831
+ "learning_rate": 1.2294757665677547e-05,
832
+ "loss": 0.1366,
833
+ "step": 880
834
+ },
835
+ {
836
+ "epoch": 0.8390289889229319,
837
+ "grad_norm": 2.265625,
838
+ "learning_rate": 1.2195845697329378e-05,
839
+ "loss": 0.1517,
840
+ "step": 890
841
+ },
842
+ {
843
+ "epoch": 0.8484562809333019,
844
+ "grad_norm": 5.15625,
845
+ "learning_rate": 1.2096933728981208e-05,
846
+ "loss": 0.1838,
847
+ "step": 900
848
+ },
849
+ {
850
+ "epoch": 0.8578835729436719,
851
+ "grad_norm": 4.375,
852
+ "learning_rate": 1.1998021760633039e-05,
853
+ "loss": 0.1513,
854
+ "step": 910
855
+ },
856
+ {
857
+ "epoch": 0.8578835729436719,
858
+ "eval_loss": 0.1435091495513916,
859
+ "eval_runtime": 2.8827,
860
+ "eval_samples_per_second": 14.917,
861
+ "eval_steps_per_second": 14.917,
862
+ "step": 910
863
+ },
864
+ {
865
+ "epoch": 0.867310864954042,
866
+ "grad_norm": 9.5,
867
+ "learning_rate": 1.1899109792284867e-05,
868
+ "loss": 0.1824,
869
+ "step": 920
870
+ },
871
+ {
872
+ "epoch": 0.8767381569644119,
873
+ "grad_norm": 4.125,
874
+ "learning_rate": 1.1800197823936696e-05,
875
+ "loss": 0.1749,
876
+ "step": 930
877
+ },
878
+ {
879
+ "epoch": 0.886165448974782,
880
+ "grad_norm": 1.546875,
881
+ "learning_rate": 1.1701285855588527e-05,
882
+ "loss": 0.1428,
883
+ "step": 940
884
+ },
885
+ {
886
+ "epoch": 0.890879094979967,
887
+ "eval_loss": 0.14254696667194366,
888
+ "eval_runtime": 3.4673,
889
+ "eval_samples_per_second": 12.401,
890
+ "eval_steps_per_second": 12.401,
891
+ "step": 945
892
+ },
893
+ {
894
+ "epoch": 0.8955927409851521,
895
+ "grad_norm": 4.65625,
896
+ "learning_rate": 1.1602373887240357e-05,
897
+ "loss": 0.1886,
898
+ "step": 950
899
+ },
900
+ {
901
+ "epoch": 0.905020032995522,
902
+ "grad_norm": 5.4375,
903
+ "learning_rate": 1.1503461918892188e-05,
904
+ "loss": 0.1705,
905
+ "step": 960
906
+ },
907
+ {
908
+ "epoch": 0.9144473250058921,
909
+ "grad_norm": 3.625,
910
+ "learning_rate": 1.1404549950544017e-05,
911
+ "loss": 0.1538,
912
+ "step": 970
913
+ },
914
+ {
915
+ "epoch": 0.9238746170162621,
916
+ "grad_norm": 5.71875,
917
+ "learning_rate": 1.1305637982195846e-05,
918
+ "loss": 0.1268,
919
+ "step": 980
920
+ },
921
+ {
922
+ "epoch": 0.9238746170162621,
923
+ "eval_loss": 0.1375056654214859,
924
+ "eval_runtime": 3.3962,
925
+ "eval_samples_per_second": 12.661,
926
+ "eval_steps_per_second": 12.661,
927
+ "step": 980
928
+ },
929
+ {
930
+ "epoch": 0.9333019090266321,
931
+ "grad_norm": 5.96875,
932
+ "learning_rate": 1.1206726013847678e-05,
933
+ "loss": 0.1882,
934
+ "step": 990
935
+ },
936
+ {
937
+ "epoch": 0.9427292010370021,
938
+ "grad_norm": 4.84375,
939
+ "learning_rate": 1.1107814045499505e-05,
940
+ "loss": 0.1376,
941
+ "step": 1000
942
+ },
943
+ {
944
+ "epoch": 0.9521564930473722,
945
+ "grad_norm": 6.0625,
946
+ "learning_rate": 1.1008902077151335e-05,
947
+ "loss": 0.1451,
948
+ "step": 1010
949
+ },
950
+ {
951
+ "epoch": 0.9568701390525571,
952
+ "eval_loss": 0.13927814364433289,
953
+ "eval_runtime": 2.8779,
954
+ "eval_samples_per_second": 14.941,
955
+ "eval_steps_per_second": 14.941,
956
+ "step": 1015
957
+ },
958
+ {
959
+ "epoch": 0.9615837850577421,
960
+ "grad_norm": 4.53125,
961
+ "learning_rate": 1.0909990108803166e-05,
962
+ "loss": 0.1524,
963
+ "step": 1020
964
+ },
965
+ {
966
+ "epoch": 0.9710110770681122,
967
+ "grad_norm": 6.4375,
968
+ "learning_rate": 1.0811078140454995e-05,
969
+ "loss": 0.1846,
970
+ "step": 1030
971
+ },
972
+ {
973
+ "epoch": 0.9804383690784823,
974
+ "grad_norm": 4.625,
975
+ "learning_rate": 1.0712166172106826e-05,
976
+ "loss": 0.1485,
977
+ "step": 1040
978
+ },
979
+ {
980
+ "epoch": 0.9898656610888522,
981
+ "grad_norm": 3.5625,
982
+ "learning_rate": 1.0613254203758656e-05,
983
+ "loss": 0.139,
984
+ "step": 1050
985
+ },
986
+ {
987
+ "epoch": 0.9898656610888522,
988
+ "eval_loss": 0.13826079666614532,
989
+ "eval_runtime": 2.8595,
990
+ "eval_samples_per_second": 15.038,
991
+ "eval_steps_per_second": 15.038,
992
+ "step": 1050
993
+ },
994
+ {
995
+ "epoch": 0.9992929530992223,
996
+ "grad_norm": 3.8125,
997
+ "learning_rate": 1.0514342235410487e-05,
998
+ "loss": 0.1477,
999
+ "step": 1060
1000
+ },
1001
+ {
1002
+ "epoch": 1.008484562809333,
1003
+ "grad_norm": 3.59375,
1004
+ "learning_rate": 1.0415430267062316e-05,
1005
+ "loss": 0.081,
1006
+ "step": 1070
1007
+ },
1008
+ {
1009
+ "epoch": 1.017911854819703,
1010
+ "grad_norm": 3.015625,
1011
+ "learning_rate": 1.0316518298714144e-05,
1012
+ "loss": 0.091,
1013
+ "step": 1080
1014
+ },
1015
+ {
1016
+ "epoch": 1.0226255008248881,
1017
+ "eval_loss": 0.14206524193286896,
1018
+ "eval_runtime": 2.8042,
1019
+ "eval_samples_per_second": 15.334,
1020
+ "eval_steps_per_second": 15.334,
1021
+ "step": 1085
1022
+ },
1023
+ {
1024
+ "epoch": 1.027339146830073,
1025
+ "grad_norm": 3.484375,
1026
+ "learning_rate": 1.0217606330365975e-05,
1027
+ "loss": 0.085,
1028
+ "step": 1090
1029
+ },
1030
+ {
1031
+ "epoch": 1.0367664388404432,
1032
+ "grad_norm": 3.546875,
1033
+ "learning_rate": 1.0118694362017805e-05,
1034
+ "loss": 0.0864,
1035
+ "step": 1100
1036
+ },
1037
+ {
1038
+ "epoch": 1.0461937308508131,
1039
+ "grad_norm": 2.515625,
1040
+ "learning_rate": 1.0019782393669636e-05,
1041
+ "loss": 0.087,
1042
+ "step": 1110
1043
+ },
1044
+ {
1045
+ "epoch": 1.055621022861183,
1046
+ "grad_norm": 3.25,
1047
+ "learning_rate": 9.920870425321465e-06,
1048
+ "loss": 0.0774,
1049
+ "step": 1120
1050
+ },
1051
+ {
1052
+ "epoch": 1.055621022861183,
1053
+ "eval_loss": 0.14084650576114655,
1054
+ "eval_runtime": 2.8547,
1055
+ "eval_samples_per_second": 15.063,
1056
+ "eval_steps_per_second": 15.063,
1057
+ "step": 1120
1058
+ },
1059
+ {
1060
+ "epoch": 1.065048314871553,
1061
+ "grad_norm": 2.46875,
1062
+ "learning_rate": 9.821958456973294e-06,
1063
+ "loss": 0.0486,
1064
+ "step": 1130
1065
+ },
1066
+ {
1067
+ "epoch": 1.0744756068819232,
1068
+ "grad_norm": 4.03125,
1069
+ "learning_rate": 9.723046488625124e-06,
1070
+ "loss": 0.0773,
1071
+ "step": 1140
1072
+ },
1073
+ {
1074
+ "epoch": 1.0839028988922932,
1075
+ "grad_norm": 8.6875,
1076
+ "learning_rate": 9.624134520276955e-06,
1077
+ "loss": 0.1005,
1078
+ "step": 1150
1079
+ },
1080
+ {
1081
+ "epoch": 1.0886165448974783,
1082
+ "eval_loss": 0.143312469124794,
1083
+ "eval_runtime": 2.7849,
1084
+ "eval_samples_per_second": 15.44,
1085
+ "eval_steps_per_second": 15.44,
1086
+ "step": 1155
1087
+ },
1088
+ {
1089
+ "epoch": 1.0933301909026631,
1090
+ "grad_norm": 2.8125,
1091
+ "learning_rate": 9.525222551928784e-06,
1092
+ "loss": 0.071,
1093
+ "step": 1160
1094
+ },
1095
+ {
1096
+ "epoch": 1.1027574829130333,
1097
+ "grad_norm": 7.8125,
1098
+ "learning_rate": 9.426310583580614e-06,
1099
+ "loss": 0.0725,
1100
+ "step": 1170
1101
+ },
1102
+ {
1103
+ "epoch": 1.1121847749234033,
1104
+ "grad_norm": 5.96875,
1105
+ "learning_rate": 9.327398615232443e-06,
1106
+ "loss": 0.081,
1107
+ "step": 1180
1108
+ },
1109
+ {
1110
+ "epoch": 1.1216120669337732,
1111
+ "grad_norm": 3.046875,
1112
+ "learning_rate": 9.228486646884274e-06,
1113
+ "loss": 0.0751,
1114
+ "step": 1190
1115
+ },
1116
+ {
1117
+ "epoch": 1.1216120669337732,
1118
+ "eval_loss": 0.14267787337303162,
1119
+ "eval_runtime": 2.9041,
1120
+ "eval_samples_per_second": 14.807,
1121
+ "eval_steps_per_second": 14.807,
1122
+ "step": 1190
1123
+ },
1124
+ {
1125
+ "epoch": 1.1310393589441432,
1126
+ "grad_norm": 3.390625,
1127
+ "learning_rate": 9.129574678536104e-06,
1128
+ "loss": 0.0721,
1129
+ "step": 1200
1130
+ },
1131
+ {
1132
+ "epoch": 1.1404666509545134,
1133
+ "grad_norm": 4.71875,
1134
+ "learning_rate": 9.030662710187933e-06,
1135
+ "loss": 0.0784,
1136
+ "step": 1210
1137
+ },
1138
+ {
1139
+ "epoch": 1.1498939429648833,
1140
+ "grad_norm": 4.03125,
1141
+ "learning_rate": 8.931750741839763e-06,
1142
+ "loss": 0.0764,
1143
+ "step": 1220
1144
+ },
1145
+ {
1146
+ "epoch": 1.1546075889700684,
1147
+ "eval_loss": 0.14501504600048065,
1148
+ "eval_runtime": 2.8486,
1149
+ "eval_samples_per_second": 15.095,
1150
+ "eval_steps_per_second": 15.095,
1151
+ "step": 1225
1152
+ },
1153
+ {
1154
+ "epoch": 1.1593212349752533,
1155
+ "grad_norm": 3.015625,
1156
+ "learning_rate": 8.832838773491594e-06,
1157
+ "loss": 0.0879,
1158
+ "step": 1230
1159
+ },
1160
+ {
1161
+ "epoch": 1.1687485269856235,
1162
+ "grad_norm": 2.8125,
1163
+ "learning_rate": 8.733926805143423e-06,
1164
+ "loss": 0.08,
1165
+ "step": 1240
1166
+ },
1167
+ {
1168
+ "epoch": 1.1781758189959934,
1169
+ "grad_norm": 3.984375,
1170
+ "learning_rate": 8.635014836795252e-06,
1171
+ "loss": 0.1068,
1172
+ "step": 1250
1173
+ },
1174
+ {
1175
+ "epoch": 1.1876031110063634,
1176
+ "grad_norm": 2.96875,
1177
+ "learning_rate": 8.536102868447082e-06,
1178
+ "loss": 0.078,
1179
+ "step": 1260
1180
+ },
1181
+ {
1182
+ "epoch": 1.1876031110063634,
1183
+ "eval_loss": 0.1404484212398529,
1184
+ "eval_runtime": 2.8927,
1185
+ "eval_samples_per_second": 14.865,
1186
+ "eval_steps_per_second": 14.865,
1187
+ "step": 1260
1188
+ },
1189
+ {
1190
+ "epoch": 1.1970304030167334,
1191
+ "grad_norm": 3.890625,
1192
+ "learning_rate": 8.437190900098913e-06,
1193
+ "loss": 0.1019,
1194
+ "step": 1270
1195
+ },
1196
+ {
1197
+ "epoch": 1.2064576950271035,
1198
+ "grad_norm": 6.25,
1199
+ "learning_rate": 8.338278931750742e-06,
1200
+ "loss": 0.09,
1201
+ "step": 1280
1202
+ },
1203
+ {
1204
+ "epoch": 1.2158849870374735,
1205
+ "grad_norm": 4.40625,
1206
+ "learning_rate": 8.239366963402572e-06,
1207
+ "loss": 0.0552,
1208
+ "step": 1290
1209
+ },
1210
+ {
1211
+ "epoch": 1.2205986330426586,
1212
+ "eval_loss": 0.14131127297878265,
1213
+ "eval_runtime": 2.8819,
1214
+ "eval_samples_per_second": 14.921,
1215
+ "eval_steps_per_second": 14.921,
1216
+ "step": 1295
1217
+ },
1218
+ {
1219
+ "epoch": 1.2253122790478435,
1220
+ "grad_norm": 3.515625,
1221
+ "learning_rate": 8.140454995054401e-06,
1222
+ "loss": 0.0674,
1223
+ "step": 1300
1224
+ },
1225
+ {
1226
+ "epoch": 1.2347395710582134,
1227
+ "grad_norm": 2.59375,
1228
+ "learning_rate": 8.041543026706232e-06,
1229
+ "loss": 0.1002,
1230
+ "step": 1310
1231
+ },
1232
+ {
1233
+ "epoch": 1.2441668630685836,
1234
+ "grad_norm": 3.59375,
1235
+ "learning_rate": 7.942631058358062e-06,
1236
+ "loss": 0.0711,
1237
+ "step": 1320
1238
+ },
1239
+ {
1240
+ "epoch": 1.2535941550789536,
1241
+ "grad_norm": 2.625,
1242
+ "learning_rate": 7.843719090009891e-06,
1243
+ "loss": 0.0699,
1244
+ "step": 1330
1245
+ },
1246
+ {
1247
+ "epoch": 1.2535941550789536,
1248
+ "eval_loss": 0.1402900665998459,
1249
+ "eval_runtime": 2.914,
1250
+ "eval_samples_per_second": 14.756,
1251
+ "eval_steps_per_second": 14.756,
1252
+ "step": 1330
1253
+ },
1254
+ {
1255
+ "epoch": 1.2630214470893235,
1256
+ "grad_norm": 3.328125,
1257
+ "learning_rate": 7.744807121661722e-06,
1258
+ "loss": 0.0965,
1259
+ "step": 1340
1260
+ },
1261
+ {
1262
+ "epoch": 1.2724487390996937,
1263
+ "grad_norm": 2.265625,
1264
+ "learning_rate": 7.645895153313552e-06,
1265
+ "loss": 0.078,
1266
+ "step": 1350
1267
+ },
1268
+ {
1269
+ "epoch": 1.2818760311100637,
1270
+ "grad_norm": 3.953125,
1271
+ "learning_rate": 7.546983184965382e-06,
1272
+ "loss": 0.0937,
1273
+ "step": 1360
1274
+ },
1275
+ {
1276
+ "epoch": 1.2865896771152485,
1277
+ "eval_loss": 0.13923154771327972,
1278
+ "eval_runtime": 2.9339,
1279
+ "eval_samples_per_second": 14.656,
1280
+ "eval_steps_per_second": 14.656,
1281
+ "step": 1365
1282
+ },
1283
+ {
1284
+ "epoch": 1.2913033231204336,
1285
+ "grad_norm": 7.21875,
1286
+ "learning_rate": 7.4480712166172105e-06,
1287
+ "loss": 0.0711,
1288
+ "step": 1370
1289
+ },
1290
+ {
1291
+ "epoch": 1.3007306151308038,
1292
+ "grad_norm": 4.75,
1293
+ "learning_rate": 7.349159248269041e-06,
1294
+ "loss": 0.091,
1295
+ "step": 1380
1296
+ },
1297
+ {
1298
+ "epoch": 1.3101579071411737,
1299
+ "grad_norm": 1.875,
1300
+ "learning_rate": 7.250247279920871e-06,
1301
+ "loss": 0.0494,
1302
+ "step": 1390
1303
+ },
1304
+ {
1305
+ "epoch": 1.3195851991515437,
1306
+ "grad_norm": 4.90625,
1307
+ "learning_rate": 7.151335311572701e-06,
1308
+ "loss": 0.1154,
1309
+ "step": 1400
1310
+ },
1311
+ {
1312
+ "epoch": 1.3195851991515437,
1313
+ "eval_loss": 0.1399063766002655,
1314
+ "eval_runtime": 2.8659,
1315
+ "eval_samples_per_second": 15.004,
1316
+ "eval_steps_per_second": 15.004,
1317
+ "step": 1400
1318
+ },
1319
+ {
1320
+ "epoch": 1.3290124911619137,
1321
+ "grad_norm": 3.59375,
1322
+ "learning_rate": 7.052423343224531e-06,
1323
+ "loss": 0.0782,
1324
+ "step": 1410
1325
+ },
1326
+ {
1327
+ "epoch": 1.3384397831722836,
1328
+ "grad_norm": 5.90625,
1329
+ "learning_rate": 6.95351137487636e-06,
1330
+ "loss": 0.0835,
1331
+ "step": 1420
1332
+ },
1333
+ {
1334
+ "epoch": 1.3478670751826538,
1335
+ "grad_norm": 3.84375,
1336
+ "learning_rate": 6.85459940652819e-06,
1337
+ "loss": 0.0856,
1338
+ "step": 1430
1339
+ },
1340
+ {
1341
+ "epoch": 1.352580721187839,
1342
+ "eval_loss": 0.1397552341222763,
1343
+ "eval_runtime": 2.8888,
1344
+ "eval_samples_per_second": 14.885,
1345
+ "eval_steps_per_second": 14.885,
1346
+ "step": 1435
1347
+ },
1348
+ {
1349
+ "epoch": 1.3572943671930238,
1350
+ "grad_norm": 2.625,
1351
+ "learning_rate": 6.755687438180021e-06,
1352
+ "loss": 0.049,
1353
+ "step": 1440
1354
+ },
1355
+ {
1356
+ "epoch": 1.3667216592033937,
1357
+ "grad_norm": 3.828125,
1358
+ "learning_rate": 6.65677546983185e-06,
1359
+ "loss": 0.0605,
1360
+ "step": 1450
1361
+ },
1362
+ {
1363
+ "epoch": 1.376148951213764,
1364
+ "grad_norm": 3.171875,
1365
+ "learning_rate": 6.55786350148368e-06,
1366
+ "loss": 0.1028,
1367
+ "step": 1460
1368
+ },
1369
+ {
1370
+ "epoch": 1.3855762432241339,
1371
+ "grad_norm": 3.46875,
1372
+ "learning_rate": 6.45895153313551e-06,
1373
+ "loss": 0.0687,
1374
+ "step": 1470
1375
+ },
1376
+ {
1377
+ "epoch": 1.3855762432241339,
1378
+ "eval_loss": 0.13968150317668915,
1379
+ "eval_runtime": 3.3677,
1380
+ "eval_samples_per_second": 12.769,
1381
+ "eval_steps_per_second": 12.769,
1382
+ "step": 1470
1383
+ },
1384
+ {
1385
+ "epoch": 1.3950035352345038,
1386
+ "grad_norm": 2.828125,
1387
+ "learning_rate": 6.36003956478734e-06,
1388
+ "loss": 0.078,
1389
+ "step": 1480
1390
+ },
1391
+ {
1392
+ "epoch": 1.404430827244874,
1393
+ "grad_norm": 5.46875,
1394
+ "learning_rate": 6.2611275964391694e-06,
1395
+ "loss": 0.0694,
1396
+ "step": 1490
1397
+ },
1398
+ {
1399
+ "epoch": 1.413858119255244,
1400
+ "grad_norm": 3.140625,
1401
+ "learning_rate": 6.162215628091e-06,
1402
+ "loss": 0.127,
1403
+ "step": 1500
1404
+ },
1405
+ {
1406
+ "epoch": 1.4185717652604288,
1407
+ "eval_loss": 0.1389547735452652,
1408
+ "eval_runtime": 2.8823,
1409
+ "eval_samples_per_second": 14.919,
1410
+ "eval_steps_per_second": 14.919,
1411
+ "step": 1505
1412
+ },
1413
+ {
1414
+ "epoch": 1.423285411265614,
1415
+ "grad_norm": 2.640625,
1416
+ "learning_rate": 6.06330365974283e-06,
1417
+ "loss": 0.0876,
1418
+ "step": 1510
1419
+ },
1420
+ {
1421
+ "epoch": 1.432712703275984,
1422
+ "grad_norm": 3.96875,
1423
+ "learning_rate": 5.964391691394659e-06,
1424
+ "loss": 0.0789,
1425
+ "step": 1520
1426
+ },
1427
+ {
1428
+ "epoch": 1.442139995286354,
1429
+ "grad_norm": 2.921875,
1430
+ "learning_rate": 5.865479723046489e-06,
1431
+ "loss": 0.0674,
1432
+ "step": 1530
1433
+ },
1434
+ {
1435
+ "epoch": 1.451567287296724,
1436
+ "grad_norm": 3.265625,
1437
+ "learning_rate": 5.766567754698319e-06,
1438
+ "loss": 0.0845,
1439
+ "step": 1540
1440
+ },
1441
+ {
1442
+ "epoch": 1.451567287296724,
1443
+ "eval_loss": 0.1385909616947174,
1444
+ "eval_runtime": 2.9439,
1445
+ "eval_samples_per_second": 14.606,
1446
+ "eval_steps_per_second": 14.606,
1447
+ "step": 1540
1448
+ },
1449
+ {
1450
+ "epoch": 1.460994579307094,
1451
+ "grad_norm": 5.3125,
1452
+ "learning_rate": 5.667655786350149e-06,
1453
+ "loss": 0.0718,
1454
+ "step": 1550
1455
+ },
1456
+ {
1457
+ "epoch": 1.470421871317464,
1458
+ "grad_norm": 5.53125,
1459
+ "learning_rate": 5.568743818001978e-06,
1460
+ "loss": 0.0957,
1461
+ "step": 1560
1462
+ },
1463
+ {
1464
+ "epoch": 1.4798491633278341,
1465
+ "grad_norm": 2.96875,
1466
+ "learning_rate": 5.469831849653808e-06,
1467
+ "loss": 0.0926,
1468
+ "step": 1570
1469
+ },
1470
+ {
1471
+ "epoch": 1.4845628093330192,
1472
+ "eval_loss": 0.13625992834568024,
1473
+ "eval_runtime": 3.3765,
1474
+ "eval_samples_per_second": 12.735,
1475
+ "eval_steps_per_second": 12.735,
1476
+ "step": 1575
1477
+ },
1478
+ {
1479
+ "epoch": 1.489276455338204,
1480
+ "grad_norm": 3.9375,
1481
+ "learning_rate": 5.370919881305638e-06,
1482
+ "loss": 0.0724,
1483
+ "step": 1580
1484
+ },
1485
+ {
1486
+ "epoch": 1.498703747348574,
1487
+ "grad_norm": 3.578125,
1488
+ "learning_rate": 5.272007912957469e-06,
1489
+ "loss": 0.0873,
1490
+ "step": 1590
1491
+ },
1492
+ {
1493
+ "epoch": 1.5081310393589442,
1494
+ "grad_norm": 5.53125,
1495
+ "learning_rate": 5.173095944609297e-06,
1496
+ "loss": 0.0958,
1497
+ "step": 1600
1498
+ },
1499
+ {
1500
+ "epoch": 1.5175583313693142,
1501
+ "grad_norm": 3.40625,
1502
+ "learning_rate": 5.0741839762611275e-06,
1503
+ "loss": 0.0539,
1504
+ "step": 1610
1505
+ },
1506
+ {
1507
+ "epoch": 1.5175583313693142,
1508
+ "eval_loss": 0.13553956151008606,
1509
+ "eval_runtime": 3.4118,
1510
+ "eval_samples_per_second": 12.603,
1511
+ "eval_steps_per_second": 12.603,
1512
+ "step": 1610
1513
+ },
1514
+ {
1515
+ "epoch": 1.5269856233796841,
1516
+ "grad_norm": 4.1875,
1517
+ "learning_rate": 4.975272007912958e-06,
1518
+ "loss": 0.0827,
1519
+ "step": 1620
1520
+ },
1521
+ {
1522
+ "epoch": 1.5364129153900543,
1523
+ "grad_norm": 2.65625,
1524
+ "learning_rate": 4.876360039564787e-06,
1525
+ "loss": 0.0771,
1526
+ "step": 1630
1527
+ },
1528
+ {
1529
+ "epoch": 1.5458402074004243,
1530
+ "grad_norm": 5.96875,
1531
+ "learning_rate": 4.7774480712166174e-06,
1532
+ "loss": 0.0769,
1533
+ "step": 1640
1534
+ },
1535
+ {
1536
+ "epoch": 1.5505538534056091,
1537
+ "eval_loss": 0.1363365650177002,
1538
+ "eval_runtime": 3.3765,
1539
+ "eval_samples_per_second": 12.735,
1540
+ "eval_steps_per_second": 12.735,
1541
+ "step": 1645
1542
+ },
1543
+ {
1544
+ "epoch": 1.5552674994107942,
1545
+ "grad_norm": 4.65625,
1546
+ "learning_rate": 4.678536102868448e-06,
1547
+ "loss": 0.0897,
1548
+ "step": 1650
1549
+ },
1550
+ {
1551
+ "epoch": 1.5646947914211644,
1552
+ "grad_norm": 5.0625,
1553
+ "learning_rate": 4.579624134520277e-06,
1554
+ "loss": 0.0871,
1555
+ "step": 1660
1556
+ },
1557
+ {
1558
+ "epoch": 1.5741220834315341,
1559
+ "grad_norm": 2.53125,
1560
+ "learning_rate": 4.480712166172107e-06,
1561
+ "loss": 0.0716,
1562
+ "step": 1670
1563
+ },
1564
+ {
1565
+ "epoch": 1.5835493754419043,
1566
+ "grad_norm": 1.7265625,
1567
+ "learning_rate": 4.381800197823937e-06,
1568
+ "loss": 0.067,
1569
+ "step": 1680
1570
+ },
1571
+ {
1572
+ "epoch": 1.5835493754419043,
1573
+ "eval_loss": 0.13660120964050293,
1574
+ "eval_runtime": 3.6215,
1575
+ "eval_samples_per_second": 11.874,
1576
+ "eval_steps_per_second": 11.874,
1577
+ "step": 1680
1578
+ },
1579
+ {
1580
+ "epoch": 1.5929766674522743,
1581
+ "grad_norm": 5.125,
1582
+ "learning_rate": 4.282888229475767e-06,
1583
+ "loss": 0.08,
1584
+ "step": 1690
1585
+ },
1586
+ {
1587
+ "epoch": 1.6024039594626442,
1588
+ "grad_norm": 4.0625,
1589
+ "learning_rate": 4.183976261127597e-06,
1590
+ "loss": 0.0766,
1591
+ "step": 1700
1592
+ },
1593
+ {
1594
+ "epoch": 1.6118312514730144,
1595
+ "grad_norm": 3.359375,
1596
+ "learning_rate": 4.085064292779427e-06,
1597
+ "loss": 0.0836,
1598
+ "step": 1710
1599
+ },
1600
+ {
1601
+ "epoch": 1.6165448974781995,
1602
+ "eval_loss": 0.13567590713500977,
1603
+ "eval_runtime": 2.8431,
1604
+ "eval_samples_per_second": 15.124,
1605
+ "eval_steps_per_second": 15.124,
1606
+ "step": 1715
1607
+ },
1608
+ {
1609
+ "epoch": 1.6212585434833844,
1610
+ "grad_norm": 2.1875,
1611
+ "learning_rate": 3.986152324431257e-06,
1612
+ "loss": 0.0617,
1613
+ "step": 1720
1614
+ },
1615
+ {
1616
+ "epoch": 1.6306858354937543,
1617
+ "grad_norm": 2.890625,
1618
+ "learning_rate": 3.887240356083086e-06,
1619
+ "loss": 0.1044,
1620
+ "step": 1730
1621
+ },
1622
+ {
1623
+ "epoch": 1.6401131275041245,
1624
+ "grad_norm": 3.390625,
1625
+ "learning_rate": 3.7883283877349162e-06,
1626
+ "loss": 0.0601,
1627
+ "step": 1740
1628
+ },
1629
+ {
1630
+ "epoch": 1.6495404195144945,
1631
+ "grad_norm": 2.59375,
1632
+ "learning_rate": 3.689416419386746e-06,
1633
+ "loss": 0.0494,
1634
+ "step": 1750
1635
+ },
1636
+ {
1637
+ "epoch": 1.6495404195144945,
1638
+ "eval_loss": 0.13626021146774292,
1639
+ "eval_runtime": 2.8485,
1640
+ "eval_samples_per_second": 15.096,
1641
+ "eval_steps_per_second": 15.096,
1642
+ "step": 1750
1643
+ },
1644
+ {
1645
+ "epoch": 1.6589677115248644,
1646
+ "grad_norm": 2.625,
1647
+ "learning_rate": 3.5905044510385763e-06,
1648
+ "loss": 0.0858,
1649
+ "step": 1760
1650
+ },
1651
+ {
1652
+ "epoch": 1.6683950035352346,
1653
+ "grad_norm": 2.890625,
1654
+ "learning_rate": 3.4915924826904058e-06,
1655
+ "loss": 0.0724,
1656
+ "step": 1770
1657
+ },
1658
+ {
1659
+ "epoch": 1.6778222955456044,
1660
+ "grad_norm": 4.15625,
1661
+ "learning_rate": 3.392680514342236e-06,
1662
+ "loss": 0.113,
1663
+ "step": 1780
1664
+ },
1665
+ {
1666
+ "epoch": 1.6825359415507894,
1667
+ "eval_loss": 0.13627412915229797,
1668
+ "eval_runtime": 2.8152,
1669
+ "eval_samples_per_second": 15.274,
1670
+ "eval_steps_per_second": 15.274,
1671
+ "step": 1785
1672
+ },
1673
+ {
1674
+ "epoch": 1.6872495875559745,
1675
+ "grad_norm": 2.484375,
1676
+ "learning_rate": 3.2937685459940654e-06,
1677
+ "loss": 0.0832,
1678
+ "step": 1790
1679
+ },
1680
+ {
1681
+ "epoch": 1.6966768795663447,
1682
+ "grad_norm": 2.734375,
1683
+ "learning_rate": 3.1948565776458957e-06,
1684
+ "loss": 0.0748,
1685
+ "step": 1800
1686
+ },
1687
+ {
1688
+ "epoch": 1.7061041715767145,
1689
+ "grad_norm": 5.625,
1690
+ "learning_rate": 3.095944609297725e-06,
1691
+ "loss": 0.0566,
1692
+ "step": 1810
1693
+ },
1694
+ {
1695
+ "epoch": 1.7155314635870846,
1696
+ "grad_norm": 2.484375,
1697
+ "learning_rate": 2.9970326409495554e-06,
1698
+ "loss": 0.0772,
1699
+ "step": 1820
1700
+ },
1701
+ {
1702
+ "epoch": 1.7155314635870846,
1703
+ "eval_loss": 0.1360086351633072,
1704
+ "eval_runtime": 2.8263,
1705
+ "eval_samples_per_second": 15.214,
1706
+ "eval_steps_per_second": 15.214,
1707
+ "step": 1820
1708
+ },
1709
+ {
1710
+ "epoch": 1.7249587555974546,
1711
+ "grad_norm": 2.96875,
1712
+ "learning_rate": 2.8981206726013848e-06,
1713
+ "loss": 0.0731,
1714
+ "step": 1830
1715
+ },
1716
+ {
1717
+ "epoch": 1.7343860476078246,
1718
+ "grad_norm": 2.890625,
1719
+ "learning_rate": 2.799208704253215e-06,
1720
+ "loss": 0.0547,
1721
+ "step": 1840
1722
+ },
1723
+ {
1724
+ "epoch": 1.7438133396181947,
1725
+ "grad_norm": 2.84375,
1726
+ "learning_rate": 2.700296735905045e-06,
1727
+ "loss": 0.0572,
1728
+ "step": 1850
1729
+ },
1730
+ {
1731
+ "epoch": 1.7485269856233798,
1732
+ "eval_loss": 0.13581496477127075,
1733
+ "eval_runtime": 3.4129,
1734
+ "eval_samples_per_second": 12.599,
1735
+ "eval_steps_per_second": 12.599,
1736
+ "step": 1855
1737
+ },
1738
+ {
1739
+ "epoch": 1.7532406316285647,
1740
+ "grad_norm": 3.0625,
1741
+ "learning_rate": 2.6013847675568747e-06,
1742
+ "loss": 0.0869,
1743
+ "step": 1860
1744
+ },
1745
+ {
1746
+ "epoch": 1.7626679236389347,
1747
+ "grad_norm": 1.234375,
1748
+ "learning_rate": 2.5024727992087046e-06,
1749
+ "loss": 0.0512,
1750
+ "step": 1870
1751
+ },
1752
+ {
1753
+ "epoch": 1.7720952156493048,
1754
+ "grad_norm": 6.1875,
1755
+ "learning_rate": 2.4035608308605344e-06,
1756
+ "loss": 0.0951,
1757
+ "step": 1880
1758
+ },
1759
+ {
1760
+ "epoch": 1.7815225076596748,
1761
+ "grad_norm": 4.09375,
1762
+ "learning_rate": 2.3046488625123642e-06,
1763
+ "loss": 0.108,
1764
+ "step": 1890
1765
+ },
1766
+ {
1767
+ "epoch": 1.7815225076596748,
1768
+ "eval_loss": 0.13570758700370789,
1769
+ "eval_runtime": 2.8749,
1770
+ "eval_samples_per_second": 14.957,
1771
+ "eval_steps_per_second": 14.957,
1772
+ "step": 1890
1773
+ },
1774
+ {
1775
+ "epoch": 1.7909497996700448,
1776
+ "grad_norm": 3.5,
1777
+ "learning_rate": 2.205736894164194e-06,
1778
+ "loss": 0.0624,
1779
+ "step": 1900
1780
+ },
1781
+ {
1782
+ "epoch": 1.800377091680415,
1783
+ "grad_norm": 3.046875,
1784
+ "learning_rate": 2.106824925816024e-06,
1785
+ "loss": 0.0644,
1786
+ "step": 1910
1787
+ },
1788
+ {
1789
+ "epoch": 1.8098043836907847,
1790
+ "grad_norm": 2.703125,
1791
+ "learning_rate": 2.0079129574678537e-06,
1792
+ "loss": 0.0756,
1793
+ "step": 1920
1794
+ },
1795
+ {
1796
+ "epoch": 1.8145180296959698,
1797
+ "eval_loss": 0.1355813890695572,
1798
+ "eval_runtime": 2.8479,
1799
+ "eval_samples_per_second": 15.099,
1800
+ "eval_steps_per_second": 15.099,
1801
+ "step": 1925
1802
+ },
1803
+ {
1804
+ "epoch": 1.8192316757011548,
1805
+ "grad_norm": 7.71875,
1806
+ "learning_rate": 1.9090009891196836e-06,
1807
+ "loss": 0.0725,
1808
+ "step": 1930
1809
+ },
1810
+ {
1811
+ "epoch": 1.828658967711525,
1812
+ "grad_norm": 2.25,
1813
+ "learning_rate": 1.8100890207715136e-06,
1814
+ "loss": 0.066,
1815
+ "step": 1940
1816
+ },
1817
+ {
1818
+ "epoch": 1.8380862597218948,
1819
+ "grad_norm": 2.296875,
1820
+ "learning_rate": 1.7111770524233435e-06,
1821
+ "loss": 0.072,
1822
+ "step": 1950
1823
+ },
1824
+ {
1825
+ "epoch": 1.847513551732265,
1826
+ "grad_norm": 5.78125,
1827
+ "learning_rate": 1.6122650840751733e-06,
1828
+ "loss": 0.092,
1829
+ "step": 1960
1830
+ },
1831
+ {
1832
+ "epoch": 1.847513551732265,
1833
+ "eval_loss": 0.13533347845077515,
1834
+ "eval_runtime": 3.3688,
1835
+ "eval_samples_per_second": 12.764,
1836
+ "eval_steps_per_second": 12.764,
1837
+ "step": 1960
1838
+ },
1839
+ {
1840
+ "epoch": 1.856940843742635,
1841
+ "grad_norm": 2.671875,
1842
+ "learning_rate": 1.5133531157270031e-06,
1843
+ "loss": 0.0586,
1844
+ "step": 1970
1845
+ },
1846
+ {
1847
+ "epoch": 1.8663681357530049,
1848
+ "grad_norm": 3.28125,
1849
+ "learning_rate": 1.414441147378833e-06,
1850
+ "loss": 0.0762,
1851
+ "step": 1980
1852
+ },
1853
+ {
1854
+ "epoch": 1.875795427763375,
1855
+ "grad_norm": 3.75,
1856
+ "learning_rate": 1.3155291790306628e-06,
1857
+ "loss": 0.0956,
1858
+ "step": 1990
1859
+ },
1860
+ {
1861
+ "epoch": 1.8805090737685601,
1862
+ "eval_loss": 0.13476963341236115,
1863
+ "eval_runtime": 2.8911,
1864
+ "eval_samples_per_second": 14.873,
1865
+ "eval_steps_per_second": 14.873,
1866
+ "step": 1995
1867
+ }
1868
+ ],
1869
+ "logging_steps": 10,
1870
+ "max_steps": 2122,
1871
+ "num_input_tokens_seen": 0,
1872
+ "num_train_epochs": 2,
1873
+ "save_steps": 35,
1874
+ "stateful_callbacks": {
1875
+ "TrainerControl": {
1876
+ "args": {
1877
+ "should_epoch_stop": false,
1878
+ "should_evaluate": false,
1879
+ "should_log": false,
1880
+ "should_save": true,
1881
+ "should_training_stop": false
1882
+ },
1883
+ "attributes": {}
1884
+ }
1885
+ },
1886
+ "total_flos": 6918461092528128.0,
1887
+ "train_batch_size": 1,
1888
+ "trial_name": null,
1889
+ "trial_params": null
1890
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5747935e6b612ba6cbefc4fc16d1fd863159328bb4c6d79b0afbb7afb99fc143
3
+ size 5777
vocab.json ADDED
The diff for this file is too large to render. See raw diff