C10X commited on
Commit
8b64de3
·
verified ·
1 Parent(s): 882626d

Upload 11 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
chat_template.jinja ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if reasoning_content %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "dtype": "float32",
8
+ "eos_token_id": 151645,
9
+ "head_dim": 16,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 64,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 160,
14
+ "layer_types": [
15
+ "full_attention",
16
+ "full_attention"
17
+ ],
18
+ "max_position_embeddings": 4096,
19
+ "max_window_layers": 2,
20
+ "model_type": "qwen3",
21
+ "num_attention_heads": 4,
22
+ "num_hidden_layers": 2,
23
+ "num_key_value_heads": 2,
24
+ "pad_token_id": 151643,
25
+ "rms_norm_eps": 1e-06,
26
+ "rope_scaling": null,
27
+ "rope_theta": 10000,
28
+ "sliding_window": null,
29
+ "tie_word_embeddings": true,
30
+ "transformers_version": "4.56.2",
31
+ "unsloth_version": "2025.11.2",
32
+ "use_cache": true,
33
+ "use_sliding_window": false,
34
+ "vocab_size": 151936
35
+ }
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645
5
+ ],
6
+ "max_length": 4096,
7
+ "max_new_tokens": 512,
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.7,
10
+ "top_p": 0.9,
11
+ "transformers_version": "4.56.2"
12
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a328a651d15db3f2be8e7f3ebb9b82271ae67eb16fdd1f1befff1827c38da70c
3
+ size 39243784
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|endoftext|>",
236
+ "padding_side": "right",
237
+ "split_special_tokens": false,
238
+ "tokenizer_class": "Qwen2Tokenizer",
239
+ "unk_token": null
240
+ }
trainer_state.json ADDED
@@ -0,0 +1,2224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 3063,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0032653061224489797,
14
+ "grad_norm": 1.763192892074585,
15
+ "learning_rate": 0.0005,
16
+ "loss": 6.1164,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.006530612244897959,
21
+ "grad_norm": 1.1696356534957886,
22
+ "learning_rate": 0.0005,
23
+ "loss": 5.8499,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.009795918367346938,
28
+ "grad_norm": 1.1040449142456055,
29
+ "learning_rate": 0.0005,
30
+ "loss": 5.6782,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.013061224489795919,
35
+ "grad_norm": 1.3148446083068848,
36
+ "learning_rate": 0.0005,
37
+ "loss": 5.6237,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.0163265306122449,
42
+ "grad_norm": 1.438623070716858,
43
+ "learning_rate": 0.0005,
44
+ "loss": 5.5178,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.019591836734693877,
49
+ "grad_norm": 1.2186000347137451,
50
+ "learning_rate": 0.0005,
51
+ "loss": 5.5895,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.022857142857142857,
56
+ "grad_norm": 1.088930606842041,
57
+ "learning_rate": 0.0005,
58
+ "loss": 5.4889,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.026122448979591838,
63
+ "grad_norm": 1.1969929933547974,
64
+ "learning_rate": 0.0005,
65
+ "loss": 5.3515,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.029387755102040815,
70
+ "grad_norm": 1.0405751466751099,
71
+ "learning_rate": 0.0005,
72
+ "loss": 5.2943,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.0326530612244898,
77
+ "grad_norm": 1.6840550899505615,
78
+ "learning_rate": 0.0005,
79
+ "loss": 5.1347,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.035918367346938776,
84
+ "grad_norm": 1.0448979139328003,
85
+ "learning_rate": 0.0005,
86
+ "loss": 5.2275,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.03918367346938775,
91
+ "grad_norm": 1.2239587306976318,
92
+ "learning_rate": 0.0005,
93
+ "loss": 5.1866,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.04244897959183674,
98
+ "grad_norm": 1.0111207962036133,
99
+ "learning_rate": 0.0005,
100
+ "loss": 5.1201,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.045714285714285714,
105
+ "grad_norm": 1.0412639379501343,
106
+ "learning_rate": 0.0005,
107
+ "loss": 5.0317,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.04897959183673469,
112
+ "grad_norm": 1.6490758657455444,
113
+ "learning_rate": 0.0005,
114
+ "loss": 4.8709,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.052244897959183675,
119
+ "grad_norm": 1.320185661315918,
120
+ "learning_rate": 0.0005,
121
+ "loss": 5.0482,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.05551020408163265,
126
+ "grad_norm": 1.23984694480896,
127
+ "learning_rate": 0.0005,
128
+ "loss": 5.1341,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.05877551020408163,
133
+ "grad_norm": 1.0305520296096802,
134
+ "learning_rate": 0.0005,
135
+ "loss": 4.9743,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.062040816326530614,
140
+ "grad_norm": 1.1690291166305542,
141
+ "learning_rate": 0.0005,
142
+ "loss": 4.8896,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.0653061224489796,
147
+ "grad_norm": 1.5160081386566162,
148
+ "learning_rate": 0.0005,
149
+ "loss": 4.7129,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.06857142857142857,
154
+ "grad_norm": 1.293174147605896,
155
+ "learning_rate": 0.0005,
156
+ "loss": 4.8936,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.07183673469387755,
161
+ "grad_norm": 0.9752352237701416,
162
+ "learning_rate": 0.0005,
163
+ "loss": 4.9842,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.07510204081632653,
168
+ "grad_norm": 1.2358481884002686,
169
+ "learning_rate": 0.0005,
170
+ "loss": 4.8956,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.0783673469387755,
175
+ "grad_norm": 1.4549710750579834,
176
+ "learning_rate": 0.0005,
177
+ "loss": 4.7804,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.08163265306122448,
182
+ "grad_norm": 1.6275160312652588,
183
+ "learning_rate": 0.0005,
184
+ "loss": 4.6489,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.08489795918367347,
189
+ "grad_norm": 1.3446155786514282,
190
+ "learning_rate": 0.0005,
191
+ "loss": 4.8603,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.08816326530612245,
196
+ "grad_norm": 1.3125636577606201,
197
+ "learning_rate": 0.0005,
198
+ "loss": 4.8614,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.09142857142857143,
203
+ "grad_norm": 1.5089269876480103,
204
+ "learning_rate": 0.0005,
205
+ "loss": 4.7528,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.0946938775510204,
210
+ "grad_norm": 1.240677833557129,
211
+ "learning_rate": 0.0005,
212
+ "loss": 4.6491,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.09795918367346938,
217
+ "grad_norm": 1.6580984592437744,
218
+ "learning_rate": 0.0005,
219
+ "loss": 4.6021,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.10122448979591837,
224
+ "grad_norm": 1.293299674987793,
225
+ "learning_rate": 0.0005,
226
+ "loss": 4.6556,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.10448979591836735,
231
+ "grad_norm": 1.0091382265090942,
232
+ "learning_rate": 0.0005,
233
+ "loss": 4.8806,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.10775510204081633,
238
+ "grad_norm": 1.2688143253326416,
239
+ "learning_rate": 0.0005,
240
+ "loss": 4.706,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.1110204081632653,
245
+ "grad_norm": 1.514876365661621,
246
+ "learning_rate": 0.0005,
247
+ "loss": 4.649,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.11428571428571428,
252
+ "grad_norm": 1.8324439525604248,
253
+ "learning_rate": 0.0005,
254
+ "loss": 4.4951,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.11755102040816326,
259
+ "grad_norm": 1.6239265203475952,
260
+ "learning_rate": 0.0005,
261
+ "loss": 4.5715,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.12081632653061225,
266
+ "grad_norm": 1.2813515663146973,
267
+ "learning_rate": 0.0005,
268
+ "loss": 4.8563,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.12408163265306123,
273
+ "grad_norm": 1.2220065593719482,
274
+ "learning_rate": 0.0005,
275
+ "loss": 4.6549,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.1273469387755102,
280
+ "grad_norm": 1.176604986190796,
281
+ "learning_rate": 0.0005,
282
+ "loss": 4.5817,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.1306122448979592,
287
+ "grad_norm": 1.7251240015029907,
288
+ "learning_rate": 0.0005,
289
+ "loss": 4.4422,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.13387755102040816,
294
+ "grad_norm": 1.2302827835083008,
295
+ "learning_rate": 0.0005,
296
+ "loss": 4.4658,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.13714285714285715,
301
+ "grad_norm": 1.0500273704528809,
302
+ "learning_rate": 0.0005,
303
+ "loss": 4.722,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.1404081632653061,
308
+ "grad_norm": 1.2972230911254883,
309
+ "learning_rate": 0.0005,
310
+ "loss": 4.6021,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.1436734693877551,
315
+ "grad_norm": 1.294978141784668,
316
+ "learning_rate": 0.0005,
317
+ "loss": 4.4754,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.1469387755102041,
322
+ "grad_norm": 1.6263995170593262,
323
+ "learning_rate": 0.0005,
324
+ "loss": 4.4881,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.15020408163265306,
329
+ "grad_norm": 1.2330529689788818,
330
+ "learning_rate": 0.0005,
331
+ "loss": 4.4441,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.15346938775510205,
336
+ "grad_norm": 0.8953180909156799,
337
+ "learning_rate": 0.0005,
338
+ "loss": 4.6281,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.156734693877551,
343
+ "grad_norm": 1.2303706407546997,
344
+ "learning_rate": 0.0005,
345
+ "loss": 4.5374,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.16,
350
+ "grad_norm": 1.4355968236923218,
351
+ "learning_rate": 0.0005,
352
+ "loss": 4.5039,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.16326530612244897,
357
+ "grad_norm": 2.549136161804199,
358
+ "learning_rate": 0.0005,
359
+ "loss": 4.3848,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.16326530612244897,
364
+ "eval_loss": 4.471920967102051,
365
+ "eval_runtime": 44.0079,
366
+ "eval_samples_per_second": 45.446,
367
+ "eval_steps_per_second": 11.362,
368
+ "step": 500
369
+ },
370
+ {
371
+ "epoch": 0.16653061224489796,
372
+ "grad_norm": 1.3125051259994507,
373
+ "learning_rate": 0.0005,
374
+ "loss": 4.429,
375
+ "step": 510
376
+ },
377
+ {
378
+ "epoch": 0.16979591836734695,
379
+ "grad_norm": 1.1771793365478516,
380
+ "learning_rate": 0.0005,
381
+ "loss": 4.6117,
382
+ "step": 520
383
+ },
384
+ {
385
+ "epoch": 0.1730612244897959,
386
+ "grad_norm": 1.0817294120788574,
387
+ "learning_rate": 0.0005,
388
+ "loss": 4.5011,
389
+ "step": 530
390
+ },
391
+ {
392
+ "epoch": 0.1763265306122449,
393
+ "grad_norm": 1.7551524639129639,
394
+ "learning_rate": 0.0005,
395
+ "loss": 4.4272,
396
+ "step": 540
397
+ },
398
+ {
399
+ "epoch": 0.17959183673469387,
400
+ "grad_norm": 2.062394142150879,
401
+ "learning_rate": 0.0005,
402
+ "loss": 4.2264,
403
+ "step": 550
404
+ },
405
+ {
406
+ "epoch": 0.18285714285714286,
407
+ "grad_norm": 1.150291919708252,
408
+ "learning_rate": 0.0005,
409
+ "loss": 4.2967,
410
+ "step": 560
411
+ },
412
+ {
413
+ "epoch": 0.18612244897959185,
414
+ "grad_norm": 1.318264126777649,
415
+ "learning_rate": 0.0005,
416
+ "loss": 4.5895,
417
+ "step": 570
418
+ },
419
+ {
420
+ "epoch": 0.1893877551020408,
421
+ "grad_norm": 1.267132043838501,
422
+ "learning_rate": 0.0005,
423
+ "loss": 4.4743,
424
+ "step": 580
425
+ },
426
+ {
427
+ "epoch": 0.1926530612244898,
428
+ "grad_norm": 1.0571893453598022,
429
+ "learning_rate": 0.0005,
430
+ "loss": 4.3215,
431
+ "step": 590
432
+ },
433
+ {
434
+ "epoch": 0.19591836734693877,
435
+ "grad_norm": 1.7111858129501343,
436
+ "learning_rate": 0.0005,
437
+ "loss": 4.3655,
438
+ "step": 600
439
+ },
440
+ {
441
+ "epoch": 0.19918367346938776,
442
+ "grad_norm": 1.3666787147521973,
443
+ "learning_rate": 0.0005,
444
+ "loss": 4.2062,
445
+ "step": 610
446
+ },
447
+ {
448
+ "epoch": 0.20244897959183675,
449
+ "grad_norm": 1.1153773069381714,
450
+ "learning_rate": 0.0005,
451
+ "loss": 4.5956,
452
+ "step": 620
453
+ },
454
+ {
455
+ "epoch": 0.2057142857142857,
456
+ "grad_norm": 1.3033626079559326,
457
+ "learning_rate": 0.0005,
458
+ "loss": 4.3626,
459
+ "step": 630
460
+ },
461
+ {
462
+ "epoch": 0.2089795918367347,
463
+ "grad_norm": 1.2020397186279297,
464
+ "learning_rate": 0.0005,
465
+ "loss": 4.2874,
466
+ "step": 640
467
+ },
468
+ {
469
+ "epoch": 0.21224489795918366,
470
+ "grad_norm": 1.620437741279602,
471
+ "learning_rate": 0.0005,
472
+ "loss": 4.3094,
473
+ "step": 650
474
+ },
475
+ {
476
+ "epoch": 0.21551020408163266,
477
+ "grad_norm": 1.3368499279022217,
478
+ "learning_rate": 0.0005,
479
+ "loss": 4.1476,
480
+ "step": 660
481
+ },
482
+ {
483
+ "epoch": 0.21877551020408162,
484
+ "grad_norm": 1.3473259210586548,
485
+ "learning_rate": 0.0005,
486
+ "loss": 4.515,
487
+ "step": 670
488
+ },
489
+ {
490
+ "epoch": 0.2220408163265306,
491
+ "grad_norm": 1.3251821994781494,
492
+ "learning_rate": 0.0005,
493
+ "loss": 4.2691,
494
+ "step": 680
495
+ },
496
+ {
497
+ "epoch": 0.2253061224489796,
498
+ "grad_norm": 1.4138338565826416,
499
+ "learning_rate": 0.0005,
500
+ "loss": 4.2523,
501
+ "step": 690
502
+ },
503
+ {
504
+ "epoch": 0.22857142857142856,
505
+ "grad_norm": 1.8546305894851685,
506
+ "learning_rate": 0.0005,
507
+ "loss": 4.2569,
508
+ "step": 700
509
+ },
510
+ {
511
+ "epoch": 0.23183673469387756,
512
+ "grad_norm": 1.3483269214630127,
513
+ "learning_rate": 0.0005,
514
+ "loss": 4.0691,
515
+ "step": 710
516
+ },
517
+ {
518
+ "epoch": 0.23510204081632652,
519
+ "grad_norm": 1.0672351121902466,
520
+ "learning_rate": 0.0005,
521
+ "loss": 4.4731,
522
+ "step": 720
523
+ },
524
+ {
525
+ "epoch": 0.2383673469387755,
526
+ "grad_norm": 1.168258786201477,
527
+ "learning_rate": 0.0005,
528
+ "loss": 4.4081,
529
+ "step": 730
530
+ },
531
+ {
532
+ "epoch": 0.2416326530612245,
533
+ "grad_norm": 1.1797970533370972,
534
+ "learning_rate": 0.0005,
535
+ "loss": 4.2101,
536
+ "step": 740
537
+ },
538
+ {
539
+ "epoch": 0.24489795918367346,
540
+ "grad_norm": 1.774629831314087,
541
+ "learning_rate": 0.0005,
542
+ "loss": 4.1419,
543
+ "step": 750
544
+ },
545
+ {
546
+ "epoch": 0.24816326530612245,
547
+ "grad_norm": 1.4174522161483765,
548
+ "learning_rate": 0.0005,
549
+ "loss": 4.0785,
550
+ "step": 760
551
+ },
552
+ {
553
+ "epoch": 0.25142857142857145,
554
+ "grad_norm": 1.1591559648513794,
555
+ "learning_rate": 0.0005,
556
+ "loss": 4.4851,
557
+ "step": 770
558
+ },
559
+ {
560
+ "epoch": 0.2546938775510204,
561
+ "grad_norm": 1.3229284286499023,
562
+ "learning_rate": 0.0005,
563
+ "loss": 4.2819,
564
+ "step": 780
565
+ },
566
+ {
567
+ "epoch": 0.25795918367346937,
568
+ "grad_norm": 1.174056887626648,
569
+ "learning_rate": 0.0005,
570
+ "loss": 4.1234,
571
+ "step": 790
572
+ },
573
+ {
574
+ "epoch": 0.2612244897959184,
575
+ "grad_norm": 1.6990975141525269,
576
+ "learning_rate": 0.0005,
577
+ "loss": 4.0965,
578
+ "step": 800
579
+ },
580
+ {
581
+ "epoch": 0.26448979591836735,
582
+ "grad_norm": 1.4492194652557373,
583
+ "learning_rate": 0.0005,
584
+ "loss": 4.0653,
585
+ "step": 810
586
+ },
587
+ {
588
+ "epoch": 0.2677551020408163,
589
+ "grad_norm": 0.9765328764915466,
590
+ "learning_rate": 0.0005,
591
+ "loss": 4.4531,
592
+ "step": 820
593
+ },
594
+ {
595
+ "epoch": 0.2710204081632653,
596
+ "grad_norm": 1.4868978261947632,
597
+ "learning_rate": 0.0005,
598
+ "loss": 4.2941,
599
+ "step": 830
600
+ },
601
+ {
602
+ "epoch": 0.2742857142857143,
603
+ "grad_norm": 1.3501718044281006,
604
+ "learning_rate": 0.0005,
605
+ "loss": 4.1963,
606
+ "step": 840
607
+ },
608
+ {
609
+ "epoch": 0.27755102040816326,
610
+ "grad_norm": 1.9492390155792236,
611
+ "learning_rate": 0.0005,
612
+ "loss": 4.1563,
613
+ "step": 850
614
+ },
615
+ {
616
+ "epoch": 0.2808163265306122,
617
+ "grad_norm": 1.2434947490692139,
618
+ "learning_rate": 0.0005,
619
+ "loss": 4.0553,
620
+ "step": 860
621
+ },
622
+ {
623
+ "epoch": 0.28408163265306124,
624
+ "grad_norm": 1.1206755638122559,
625
+ "learning_rate": 0.0005,
626
+ "loss": 4.3843,
627
+ "step": 870
628
+ },
629
+ {
630
+ "epoch": 0.2873469387755102,
631
+ "grad_norm": 1.1947944164276123,
632
+ "learning_rate": 0.0005,
633
+ "loss": 4.2625,
634
+ "step": 880
635
+ },
636
+ {
637
+ "epoch": 0.29061224489795917,
638
+ "grad_norm": 1.1833423376083374,
639
+ "learning_rate": 0.0005,
640
+ "loss": 4.1207,
641
+ "step": 890
642
+ },
643
+ {
644
+ "epoch": 0.2938775510204082,
645
+ "grad_norm": 1.6545424461364746,
646
+ "learning_rate": 0.0005,
647
+ "loss": 4.1006,
648
+ "step": 900
649
+ },
650
+ {
651
+ "epoch": 0.29714285714285715,
652
+ "grad_norm": 1.1993494033813477,
653
+ "learning_rate": 0.0005,
654
+ "loss": 3.9992,
655
+ "step": 910
656
+ },
657
+ {
658
+ "epoch": 0.3004081632653061,
659
+ "grad_norm": 1.017640233039856,
660
+ "learning_rate": 0.0005,
661
+ "loss": 4.3819,
662
+ "step": 920
663
+ },
664
+ {
665
+ "epoch": 0.3036734693877551,
666
+ "grad_norm": 1.300307273864746,
667
+ "learning_rate": 0.0005,
668
+ "loss": 4.182,
669
+ "step": 930
670
+ },
671
+ {
672
+ "epoch": 0.3069387755102041,
673
+ "grad_norm": 1.4982844591140747,
674
+ "learning_rate": 0.0005,
675
+ "loss": 4.0585,
676
+ "step": 940
677
+ },
678
+ {
679
+ "epoch": 0.31020408163265306,
680
+ "grad_norm": 1.8079185485839844,
681
+ "learning_rate": 0.0005,
682
+ "loss": 4.0825,
683
+ "step": 950
684
+ },
685
+ {
686
+ "epoch": 0.313469387755102,
687
+ "grad_norm": 1.2230055332183838,
688
+ "learning_rate": 0.0005,
689
+ "loss": 3.9335,
690
+ "step": 960
691
+ },
692
+ {
693
+ "epoch": 0.31673469387755104,
694
+ "grad_norm": 1.372392177581787,
695
+ "learning_rate": 0.0005,
696
+ "loss": 4.3513,
697
+ "step": 970
698
+ },
699
+ {
700
+ "epoch": 0.32,
701
+ "grad_norm": 1.4195263385772705,
702
+ "learning_rate": 0.0005,
703
+ "loss": 4.1446,
704
+ "step": 980
705
+ },
706
+ {
707
+ "epoch": 0.32326530612244897,
708
+ "grad_norm": 1.5404787063598633,
709
+ "learning_rate": 0.0005,
710
+ "loss": 4.0195,
711
+ "step": 990
712
+ },
713
+ {
714
+ "epoch": 0.32653061224489793,
715
+ "grad_norm": 1.884207844734192,
716
+ "learning_rate": 0.0005,
717
+ "loss": 4.0313,
718
+ "step": 1000
719
+ },
720
+ {
721
+ "epoch": 0.32653061224489793,
722
+ "eval_loss": 4.118765354156494,
723
+ "eval_runtime": 43.1312,
724
+ "eval_samples_per_second": 46.37,
725
+ "eval_steps_per_second": 11.593,
726
+ "step": 1000
727
+ },
728
+ {
729
+ "epoch": 0.32979591836734695,
730
+ "grad_norm": 1.2172777652740479,
731
+ "learning_rate": 0.0005,
732
+ "loss": 3.8635,
733
+ "step": 1010
734
+ },
735
+ {
736
+ "epoch": 0.3330612244897959,
737
+ "grad_norm": 1.3385504484176636,
738
+ "learning_rate": 0.0005,
739
+ "loss": 4.2695,
740
+ "step": 1020
741
+ },
742
+ {
743
+ "epoch": 0.3363265306122449,
744
+ "grad_norm": 1.2645728588104248,
745
+ "learning_rate": 0.0005,
746
+ "loss": 4.1298,
747
+ "step": 1030
748
+ },
749
+ {
750
+ "epoch": 0.3395918367346939,
751
+ "grad_norm": 1.1611703634262085,
752
+ "learning_rate": 0.0005,
753
+ "loss": 4.0871,
754
+ "step": 1040
755
+ },
756
+ {
757
+ "epoch": 0.34285714285714286,
758
+ "grad_norm": 1.9043705463409424,
759
+ "learning_rate": 0.0005,
760
+ "loss": 4.0687,
761
+ "step": 1050
762
+ },
763
+ {
764
+ "epoch": 0.3461224489795918,
765
+ "grad_norm": 1.109143853187561,
766
+ "learning_rate": 0.0005,
767
+ "loss": 3.9311,
768
+ "step": 1060
769
+ },
770
+ {
771
+ "epoch": 0.34938775510204084,
772
+ "grad_norm": 1.3506762981414795,
773
+ "learning_rate": 0.0005,
774
+ "loss": 4.2937,
775
+ "step": 1070
776
+ },
777
+ {
778
+ "epoch": 0.3526530612244898,
779
+ "grad_norm": 1.5633002519607544,
780
+ "learning_rate": 0.0005,
781
+ "loss": 4.0552,
782
+ "step": 1080
783
+ },
784
+ {
785
+ "epoch": 0.35591836734693877,
786
+ "grad_norm": 1.3422085046768188,
787
+ "learning_rate": 0.0005,
788
+ "loss": 4.1209,
789
+ "step": 1090
790
+ },
791
+ {
792
+ "epoch": 0.35918367346938773,
793
+ "grad_norm": 1.8398027420043945,
794
+ "learning_rate": 0.0005,
795
+ "loss": 4.0818,
796
+ "step": 1100
797
+ },
798
+ {
799
+ "epoch": 0.36244897959183675,
800
+ "grad_norm": 1.0788570642471313,
801
+ "learning_rate": 0.0005,
802
+ "loss": 3.8917,
803
+ "step": 1110
804
+ },
805
+ {
806
+ "epoch": 0.3657142857142857,
807
+ "grad_norm": 1.3736050128936768,
808
+ "learning_rate": 0.0005,
809
+ "loss": 4.2967,
810
+ "step": 1120
811
+ },
812
+ {
813
+ "epoch": 0.3689795918367347,
814
+ "grad_norm": 1.249338150024414,
815
+ "learning_rate": 0.0005,
816
+ "loss": 4.1155,
817
+ "step": 1130
818
+ },
819
+ {
820
+ "epoch": 0.3722448979591837,
821
+ "grad_norm": 1.3408821821212769,
822
+ "learning_rate": 0.0005,
823
+ "loss": 4.0186,
824
+ "step": 1140
825
+ },
826
+ {
827
+ "epoch": 0.37551020408163266,
828
+ "grad_norm": 1.4768186807632446,
829
+ "learning_rate": 0.0005,
830
+ "loss": 3.9932,
831
+ "step": 1150
832
+ },
833
+ {
834
+ "epoch": 0.3787755102040816,
835
+ "grad_norm": 1.274889349937439,
836
+ "learning_rate": 0.0005,
837
+ "loss": 3.7688,
838
+ "step": 1160
839
+ },
840
+ {
841
+ "epoch": 0.3820408163265306,
842
+ "grad_norm": 1.17601478099823,
843
+ "learning_rate": 0.0005,
844
+ "loss": 4.1915,
845
+ "step": 1170
846
+ },
847
+ {
848
+ "epoch": 0.3853061224489796,
849
+ "grad_norm": 1.2624982595443726,
850
+ "learning_rate": 0.0005,
851
+ "loss": 4.0991,
852
+ "step": 1180
853
+ },
854
+ {
855
+ "epoch": 0.38857142857142857,
856
+ "grad_norm": 1.4248754978179932,
857
+ "learning_rate": 0.0005,
858
+ "loss": 4.0275,
859
+ "step": 1190
860
+ },
861
+ {
862
+ "epoch": 0.39183673469387753,
863
+ "grad_norm": 1.8577288389205933,
864
+ "learning_rate": 0.0005,
865
+ "loss": 3.9708,
866
+ "step": 1200
867
+ },
868
+ {
869
+ "epoch": 0.39510204081632655,
870
+ "grad_norm": 1.2153960466384888,
871
+ "learning_rate": 0.0005,
872
+ "loss": 3.8195,
873
+ "step": 1210
874
+ },
875
+ {
876
+ "epoch": 0.3983673469387755,
877
+ "grad_norm": 1.238071084022522,
878
+ "learning_rate": 0.0005,
879
+ "loss": 4.3136,
880
+ "step": 1220
881
+ },
882
+ {
883
+ "epoch": 0.4016326530612245,
884
+ "grad_norm": 1.4334009885787964,
885
+ "learning_rate": 0.0005,
886
+ "loss": 4.1319,
887
+ "step": 1230
888
+ },
889
+ {
890
+ "epoch": 0.4048979591836735,
891
+ "grad_norm": 1.3810005187988281,
892
+ "learning_rate": 0.0005,
893
+ "loss": 4.0287,
894
+ "step": 1240
895
+ },
896
+ {
897
+ "epoch": 0.40816326530612246,
898
+ "grad_norm": 2.1688148975372314,
899
+ "learning_rate": 0.0005,
900
+ "loss": 4.0158,
901
+ "step": 1250
902
+ },
903
+ {
904
+ "epoch": 0.4114285714285714,
905
+ "grad_norm": 1.2777838706970215,
906
+ "learning_rate": 0.0005,
907
+ "loss": 3.8439,
908
+ "step": 1260
909
+ },
910
+ {
911
+ "epoch": 0.4146938775510204,
912
+ "grad_norm": 1.1235041618347168,
913
+ "learning_rate": 0.0005,
914
+ "loss": 4.2695,
915
+ "step": 1270
916
+ },
917
+ {
918
+ "epoch": 0.4179591836734694,
919
+ "grad_norm": 1.2477576732635498,
920
+ "learning_rate": 0.0005,
921
+ "loss": 4.0986,
922
+ "step": 1280
923
+ },
924
+ {
925
+ "epoch": 0.42122448979591837,
926
+ "grad_norm": 1.2333978414535522,
927
+ "learning_rate": 0.0005,
928
+ "loss": 4.007,
929
+ "step": 1290
930
+ },
931
+ {
932
+ "epoch": 0.42448979591836733,
933
+ "grad_norm": 2.194262981414795,
934
+ "learning_rate": 0.0005,
935
+ "loss": 3.9411,
936
+ "step": 1300
937
+ },
938
+ {
939
+ "epoch": 0.42775510204081635,
940
+ "grad_norm": 1.084908366203308,
941
+ "learning_rate": 0.0005,
942
+ "loss": 3.6208,
943
+ "step": 1310
944
+ },
945
+ {
946
+ "epoch": 0.4310204081632653,
947
+ "grad_norm": 1.275455117225647,
948
+ "learning_rate": 0.0005,
949
+ "loss": 4.2136,
950
+ "step": 1320
951
+ },
952
+ {
953
+ "epoch": 0.4342857142857143,
954
+ "grad_norm": 1.4374445676803589,
955
+ "learning_rate": 0.0005,
956
+ "loss": 4.0137,
957
+ "step": 1330
958
+ },
959
+ {
960
+ "epoch": 0.43755102040816324,
961
+ "grad_norm": 1.3499337434768677,
962
+ "learning_rate": 0.0005,
963
+ "loss": 3.8929,
964
+ "step": 1340
965
+ },
966
+ {
967
+ "epoch": 0.44081632653061226,
968
+ "grad_norm": 1.9091603755950928,
969
+ "learning_rate": 0.0005,
970
+ "loss": 3.8582,
971
+ "step": 1350
972
+ },
973
+ {
974
+ "epoch": 0.4440816326530612,
975
+ "grad_norm": 1.0902653932571411,
976
+ "learning_rate": 0.0005,
977
+ "loss": 3.6813,
978
+ "step": 1360
979
+ },
980
+ {
981
+ "epoch": 0.4473469387755102,
982
+ "grad_norm": 1.1447824239730835,
983
+ "learning_rate": 0.0005,
984
+ "loss": 4.236,
985
+ "step": 1370
986
+ },
987
+ {
988
+ "epoch": 0.4506122448979592,
989
+ "grad_norm": 1.4679241180419922,
990
+ "learning_rate": 0.0005,
991
+ "loss": 3.9569,
992
+ "step": 1380
993
+ },
994
+ {
995
+ "epoch": 0.45387755102040817,
996
+ "grad_norm": 1.7417553663253784,
997
+ "learning_rate": 0.0005,
998
+ "loss": 3.8949,
999
+ "step": 1390
1000
+ },
1001
+ {
1002
+ "epoch": 0.45714285714285713,
1003
+ "grad_norm": 1.7916682958602905,
1004
+ "learning_rate": 0.0005,
1005
+ "loss": 3.8576,
1006
+ "step": 1400
1007
+ },
1008
+ {
1009
+ "epoch": 0.46040816326530615,
1010
+ "grad_norm": 1.053566813468933,
1011
+ "learning_rate": 0.0005,
1012
+ "loss": 3.6635,
1013
+ "step": 1410
1014
+ },
1015
+ {
1016
+ "epoch": 0.4636734693877551,
1017
+ "grad_norm": 1.0085692405700684,
1018
+ "learning_rate": 0.0005,
1019
+ "loss": 4.11,
1020
+ "step": 1420
1021
+ },
1022
+ {
1023
+ "epoch": 0.4669387755102041,
1024
+ "grad_norm": 1.3383585214614868,
1025
+ "learning_rate": 0.0005,
1026
+ "loss": 3.9825,
1027
+ "step": 1430
1028
+ },
1029
+ {
1030
+ "epoch": 0.47020408163265304,
1031
+ "grad_norm": 1.3227241039276123,
1032
+ "learning_rate": 0.0005,
1033
+ "loss": 3.8703,
1034
+ "step": 1440
1035
+ },
1036
+ {
1037
+ "epoch": 0.47346938775510206,
1038
+ "grad_norm": 2.0333919525146484,
1039
+ "learning_rate": 0.0005,
1040
+ "loss": 3.8328,
1041
+ "step": 1450
1042
+ },
1043
+ {
1044
+ "epoch": 0.476734693877551,
1045
+ "grad_norm": 1.2266664505004883,
1046
+ "learning_rate": 0.0005,
1047
+ "loss": 3.6192,
1048
+ "step": 1460
1049
+ },
1050
+ {
1051
+ "epoch": 0.48,
1052
+ "grad_norm": 1.3952326774597168,
1053
+ "learning_rate": 0.0005,
1054
+ "loss": 4.149,
1055
+ "step": 1470
1056
+ },
1057
+ {
1058
+ "epoch": 0.483265306122449,
1059
+ "grad_norm": 1.4793697595596313,
1060
+ "learning_rate": 0.0005,
1061
+ "loss": 4.0945,
1062
+ "step": 1480
1063
+ },
1064
+ {
1065
+ "epoch": 0.48653061224489796,
1066
+ "grad_norm": 1.4412100315093994,
1067
+ "learning_rate": 0.0005,
1068
+ "loss": 3.8816,
1069
+ "step": 1490
1070
+ },
1071
+ {
1072
+ "epoch": 0.4897959183673469,
1073
+ "grad_norm": 1.8379197120666504,
1074
+ "learning_rate": 0.0005,
1075
+ "loss": 3.766,
1076
+ "step": 1500
1077
+ },
1078
+ {
1079
+ "epoch": 0.4897959183673469,
1080
+ "eval_loss": 3.902669906616211,
1081
+ "eval_runtime": 43.0604,
1082
+ "eval_samples_per_second": 46.446,
1083
+ "eval_steps_per_second": 11.612,
1084
+ "step": 1500
1085
+ },
1086
+ {
1087
+ "epoch": 0.4930612244897959,
1088
+ "grad_norm": 1.1561907529830933,
1089
+ "learning_rate": 0.0005,
1090
+ "loss": 3.5564,
1091
+ "step": 1510
1092
+ },
1093
+ {
1094
+ "epoch": 0.4963265306122449,
1095
+ "grad_norm": 1.1232249736785889,
1096
+ "learning_rate": 0.0005,
1097
+ "loss": 4.1061,
1098
+ "step": 1520
1099
+ },
1100
+ {
1101
+ "epoch": 0.4995918367346939,
1102
+ "grad_norm": 1.1983751058578491,
1103
+ "learning_rate": 0.0005,
1104
+ "loss": 4.0355,
1105
+ "step": 1530
1106
+ },
1107
+ {
1108
+ "epoch": 0.5028571428571429,
1109
+ "grad_norm": 1.4547094106674194,
1110
+ "learning_rate": 0.0005,
1111
+ "loss": 3.9718,
1112
+ "step": 1540
1113
+ },
1114
+ {
1115
+ "epoch": 0.5061224489795918,
1116
+ "grad_norm": 2.0942959785461426,
1117
+ "learning_rate": 0.0005,
1118
+ "loss": 3.8531,
1119
+ "step": 1550
1120
+ },
1121
+ {
1122
+ "epoch": 0.5093877551020408,
1123
+ "grad_norm": 1.4254790544509888,
1124
+ "learning_rate": 0.0005,
1125
+ "loss": 3.6136,
1126
+ "step": 1560
1127
+ },
1128
+ {
1129
+ "epoch": 0.5126530612244898,
1130
+ "grad_norm": 1.3434633016586304,
1131
+ "learning_rate": 0.0005,
1132
+ "loss": 4.0861,
1133
+ "step": 1570
1134
+ },
1135
+ {
1136
+ "epoch": 0.5159183673469387,
1137
+ "grad_norm": 1.2875784635543823,
1138
+ "learning_rate": 0.0005,
1139
+ "loss": 3.9535,
1140
+ "step": 1580
1141
+ },
1142
+ {
1143
+ "epoch": 0.5191836734693878,
1144
+ "grad_norm": 1.324489951133728,
1145
+ "learning_rate": 0.0005,
1146
+ "loss": 3.7898,
1147
+ "step": 1590
1148
+ },
1149
+ {
1150
+ "epoch": 0.5224489795918368,
1151
+ "grad_norm": 1.9661375284194946,
1152
+ "learning_rate": 0.0005,
1153
+ "loss": 3.8496,
1154
+ "step": 1600
1155
+ },
1156
+ {
1157
+ "epoch": 0.5257142857142857,
1158
+ "grad_norm": 1.1303874254226685,
1159
+ "learning_rate": 0.0005,
1160
+ "loss": 3.5894,
1161
+ "step": 1610
1162
+ },
1163
+ {
1164
+ "epoch": 0.5289795918367347,
1165
+ "grad_norm": 1.2739813327789307,
1166
+ "learning_rate": 0.0005,
1167
+ "loss": 4.0466,
1168
+ "step": 1620
1169
+ },
1170
+ {
1171
+ "epoch": 0.5322448979591837,
1172
+ "grad_norm": 1.3381832838058472,
1173
+ "learning_rate": 0.0005,
1174
+ "loss": 3.9463,
1175
+ "step": 1630
1176
+ },
1177
+ {
1178
+ "epoch": 0.5355102040816326,
1179
+ "grad_norm": 1.3114221096038818,
1180
+ "learning_rate": 0.0005,
1181
+ "loss": 3.8021,
1182
+ "step": 1640
1183
+ },
1184
+ {
1185
+ "epoch": 0.5387755102040817,
1186
+ "grad_norm": 1.9492048025131226,
1187
+ "learning_rate": 0.0005,
1188
+ "loss": 3.7187,
1189
+ "step": 1650
1190
+ },
1191
+ {
1192
+ "epoch": 0.5420408163265306,
1193
+ "grad_norm": 1.1877042055130005,
1194
+ "learning_rate": 0.0005,
1195
+ "loss": 3.5491,
1196
+ "step": 1660
1197
+ },
1198
+ {
1199
+ "epoch": 0.5453061224489796,
1200
+ "grad_norm": 1.1403299570083618,
1201
+ "learning_rate": 0.0005,
1202
+ "loss": 4.0365,
1203
+ "step": 1670
1204
+ },
1205
+ {
1206
+ "epoch": 0.5485714285714286,
1207
+ "grad_norm": 1.369149088859558,
1208
+ "learning_rate": 0.0005,
1209
+ "loss": 3.8219,
1210
+ "step": 1680
1211
+ },
1212
+ {
1213
+ "epoch": 0.5518367346938775,
1214
+ "grad_norm": 1.3963128328323364,
1215
+ "learning_rate": 0.0005,
1216
+ "loss": 3.9018,
1217
+ "step": 1690
1218
+ },
1219
+ {
1220
+ "epoch": 0.5551020408163265,
1221
+ "grad_norm": 1.9626106023788452,
1222
+ "learning_rate": 0.0005,
1223
+ "loss": 3.8125,
1224
+ "step": 1700
1225
+ },
1226
+ {
1227
+ "epoch": 0.5583673469387755,
1228
+ "grad_norm": 1.3681797981262207,
1229
+ "learning_rate": 0.0005,
1230
+ "loss": 3.5254,
1231
+ "step": 1710
1232
+ },
1233
+ {
1234
+ "epoch": 0.5616326530612245,
1235
+ "grad_norm": 1.3346383571624756,
1236
+ "learning_rate": 0.0005,
1237
+ "loss": 4.1034,
1238
+ "step": 1720
1239
+ },
1240
+ {
1241
+ "epoch": 0.5648979591836735,
1242
+ "grad_norm": 1.2607356309890747,
1243
+ "learning_rate": 0.0005,
1244
+ "loss": 3.8995,
1245
+ "step": 1730
1246
+ },
1247
+ {
1248
+ "epoch": 0.5681632653061225,
1249
+ "grad_norm": 1.4595521688461304,
1250
+ "learning_rate": 0.0005,
1251
+ "loss": 3.7525,
1252
+ "step": 1740
1253
+ },
1254
+ {
1255
+ "epoch": 0.5714285714285714,
1256
+ "grad_norm": 2.0250532627105713,
1257
+ "learning_rate": 0.0005,
1258
+ "loss": 3.8421,
1259
+ "step": 1750
1260
+ },
1261
+ {
1262
+ "epoch": 0.5746938775510204,
1263
+ "grad_norm": 1.277748942375183,
1264
+ "learning_rate": 0.0005,
1265
+ "loss": 3.6177,
1266
+ "step": 1760
1267
+ },
1268
+ {
1269
+ "epoch": 0.5779591836734694,
1270
+ "grad_norm": 1.3413293361663818,
1271
+ "learning_rate": 0.0005,
1272
+ "loss": 4.0805,
1273
+ "step": 1770
1274
+ },
1275
+ {
1276
+ "epoch": 0.5812244897959183,
1277
+ "grad_norm": 1.1629762649536133,
1278
+ "learning_rate": 0.0005,
1279
+ "loss": 3.9218,
1280
+ "step": 1780
1281
+ },
1282
+ {
1283
+ "epoch": 0.5844897959183674,
1284
+ "grad_norm": 1.3503767251968384,
1285
+ "learning_rate": 0.0005,
1286
+ "loss": 3.7097,
1287
+ "step": 1790
1288
+ },
1289
+ {
1290
+ "epoch": 0.5877551020408164,
1291
+ "grad_norm": 1.966658353805542,
1292
+ "learning_rate": 0.0005,
1293
+ "loss": 3.8113,
1294
+ "step": 1800
1295
+ },
1296
+ {
1297
+ "epoch": 0.5910204081632653,
1298
+ "grad_norm": 1.1706182956695557,
1299
+ "learning_rate": 0.0005,
1300
+ "loss": 3.6205,
1301
+ "step": 1810
1302
+ },
1303
+ {
1304
+ "epoch": 0.5942857142857143,
1305
+ "grad_norm": 1.095694661140442,
1306
+ "learning_rate": 0.0005,
1307
+ "loss": 3.9975,
1308
+ "step": 1820
1309
+ },
1310
+ {
1311
+ "epoch": 0.5975510204081632,
1312
+ "grad_norm": 1.3853328227996826,
1313
+ "learning_rate": 0.0005,
1314
+ "loss": 3.8375,
1315
+ "step": 1830
1316
+ },
1317
+ {
1318
+ "epoch": 0.6008163265306122,
1319
+ "grad_norm": 1.396588683128357,
1320
+ "learning_rate": 0.0005,
1321
+ "loss": 3.8137,
1322
+ "step": 1840
1323
+ },
1324
+ {
1325
+ "epoch": 0.6040816326530613,
1326
+ "grad_norm": 1.8910547494888306,
1327
+ "learning_rate": 0.0005,
1328
+ "loss": 3.8027,
1329
+ "step": 1850
1330
+ },
1331
+ {
1332
+ "epoch": 0.6073469387755102,
1333
+ "grad_norm": 1.1377391815185547,
1334
+ "learning_rate": 0.0005,
1335
+ "loss": 3.4935,
1336
+ "step": 1860
1337
+ },
1338
+ {
1339
+ "epoch": 0.6106122448979592,
1340
+ "grad_norm": 1.1120401620864868,
1341
+ "learning_rate": 0.0005,
1342
+ "loss": 4.0356,
1343
+ "step": 1870
1344
+ },
1345
+ {
1346
+ "epoch": 0.6138775510204082,
1347
+ "grad_norm": 1.2026475667953491,
1348
+ "learning_rate": 0.0005,
1349
+ "loss": 3.8108,
1350
+ "step": 1880
1351
+ },
1352
+ {
1353
+ "epoch": 0.6171428571428571,
1354
+ "grad_norm": 1.4205538034439087,
1355
+ "learning_rate": 0.0005,
1356
+ "loss": 3.731,
1357
+ "step": 1890
1358
+ },
1359
+ {
1360
+ "epoch": 0.6204081632653061,
1361
+ "grad_norm": 1.925189733505249,
1362
+ "learning_rate": 0.0005,
1363
+ "loss": 3.7237,
1364
+ "step": 1900
1365
+ },
1366
+ {
1367
+ "epoch": 0.6236734693877551,
1368
+ "grad_norm": 1.1621805429458618,
1369
+ "learning_rate": 0.0005,
1370
+ "loss": 3.5831,
1371
+ "step": 1910
1372
+ },
1373
+ {
1374
+ "epoch": 0.626938775510204,
1375
+ "grad_norm": 1.2125190496444702,
1376
+ "learning_rate": 0.0005,
1377
+ "loss": 3.9736,
1378
+ "step": 1920
1379
+ },
1380
+ {
1381
+ "epoch": 0.6302040816326531,
1382
+ "grad_norm": 1.4116473197937012,
1383
+ "learning_rate": 0.0005,
1384
+ "loss": 3.8547,
1385
+ "step": 1930
1386
+ },
1387
+ {
1388
+ "epoch": 0.6334693877551021,
1389
+ "grad_norm": 1.5884093046188354,
1390
+ "learning_rate": 0.0005,
1391
+ "loss": 3.7234,
1392
+ "step": 1940
1393
+ },
1394
+ {
1395
+ "epoch": 0.636734693877551,
1396
+ "grad_norm": 1.875867486000061,
1397
+ "learning_rate": 0.0005,
1398
+ "loss": 3.7336,
1399
+ "step": 1950
1400
+ },
1401
+ {
1402
+ "epoch": 0.64,
1403
+ "grad_norm": 1.1436996459960938,
1404
+ "learning_rate": 0.0005,
1405
+ "loss": 3.4685,
1406
+ "step": 1960
1407
+ },
1408
+ {
1409
+ "epoch": 0.643265306122449,
1410
+ "grad_norm": 1.4208112955093384,
1411
+ "learning_rate": 0.0005,
1412
+ "loss": 4.0489,
1413
+ "step": 1970
1414
+ },
1415
+ {
1416
+ "epoch": 0.6465306122448979,
1417
+ "grad_norm": 1.3270920515060425,
1418
+ "learning_rate": 0.0005,
1419
+ "loss": 3.8098,
1420
+ "step": 1980
1421
+ },
1422
+ {
1423
+ "epoch": 0.649795918367347,
1424
+ "grad_norm": 1.6381317377090454,
1425
+ "learning_rate": 0.0005,
1426
+ "loss": 3.6228,
1427
+ "step": 1990
1428
+ },
1429
+ {
1430
+ "epoch": 0.6530612244897959,
1431
+ "grad_norm": 2.0222079753875732,
1432
+ "learning_rate": 0.0005,
1433
+ "loss": 3.678,
1434
+ "step": 2000
1435
+ },
1436
+ {
1437
+ "epoch": 0.6530612244897959,
1438
+ "eval_loss": 3.763411521911621,
1439
+ "eval_runtime": 43.2803,
1440
+ "eval_samples_per_second": 46.21,
1441
+ "eval_steps_per_second": 11.553,
1442
+ "step": 2000
1443
+ },
1444
+ {
1445
+ "epoch": 0.6563265306122449,
1446
+ "grad_norm": 1.211680293083191,
1447
+ "learning_rate": 0.0005,
1448
+ "loss": 3.5285,
1449
+ "step": 2010
1450
+ },
1451
+ {
1452
+ "epoch": 0.6595918367346939,
1453
+ "grad_norm": 1.27492094039917,
1454
+ "learning_rate": 0.0005,
1455
+ "loss": 3.977,
1456
+ "step": 2020
1457
+ },
1458
+ {
1459
+ "epoch": 0.6628571428571428,
1460
+ "grad_norm": 1.4437798261642456,
1461
+ "learning_rate": 0.0005,
1462
+ "loss": 3.8207,
1463
+ "step": 2030
1464
+ },
1465
+ {
1466
+ "epoch": 0.6661224489795918,
1467
+ "grad_norm": 1.3550224304199219,
1468
+ "learning_rate": 0.0005,
1469
+ "loss": 3.6058,
1470
+ "step": 2040
1471
+ },
1472
+ {
1473
+ "epoch": 0.6693877551020408,
1474
+ "grad_norm": 2.0937228202819824,
1475
+ "learning_rate": 0.0005,
1476
+ "loss": 3.6986,
1477
+ "step": 2050
1478
+ },
1479
+ {
1480
+ "epoch": 0.6726530612244898,
1481
+ "grad_norm": 1.0755195617675781,
1482
+ "learning_rate": 0.0005,
1483
+ "loss": 3.4733,
1484
+ "step": 2060
1485
+ },
1486
+ {
1487
+ "epoch": 0.6759183673469388,
1488
+ "grad_norm": 1.157943606376648,
1489
+ "learning_rate": 0.0005,
1490
+ "loss": 3.9306,
1491
+ "step": 2070
1492
+ },
1493
+ {
1494
+ "epoch": 0.6791836734693878,
1495
+ "grad_norm": 1.1048219203948975,
1496
+ "learning_rate": 0.0005,
1497
+ "loss": 3.7846,
1498
+ "step": 2080
1499
+ },
1500
+ {
1501
+ "epoch": 0.6824489795918367,
1502
+ "grad_norm": 1.385745882987976,
1503
+ "learning_rate": 0.0005,
1504
+ "loss": 3.804,
1505
+ "step": 2090
1506
+ },
1507
+ {
1508
+ "epoch": 0.6857142857142857,
1509
+ "grad_norm": 1.8955817222595215,
1510
+ "learning_rate": 0.0005,
1511
+ "loss": 3.6585,
1512
+ "step": 2100
1513
+ },
1514
+ {
1515
+ "epoch": 0.6889795918367347,
1516
+ "grad_norm": 1.0233700275421143,
1517
+ "learning_rate": 0.0005,
1518
+ "loss": 3.4444,
1519
+ "step": 2110
1520
+ },
1521
+ {
1522
+ "epoch": 0.6922448979591836,
1523
+ "grad_norm": 1.180027723312378,
1524
+ "learning_rate": 0.0005,
1525
+ "loss": 3.9621,
1526
+ "step": 2120
1527
+ },
1528
+ {
1529
+ "epoch": 0.6955102040816327,
1530
+ "grad_norm": 1.2139476537704468,
1531
+ "learning_rate": 0.0005,
1532
+ "loss": 3.785,
1533
+ "step": 2130
1534
+ },
1535
+ {
1536
+ "epoch": 0.6987755102040817,
1537
+ "grad_norm": 1.285551905632019,
1538
+ "learning_rate": 0.0005,
1539
+ "loss": 3.7749,
1540
+ "step": 2140
1541
+ },
1542
+ {
1543
+ "epoch": 0.7020408163265306,
1544
+ "grad_norm": 1.9963250160217285,
1545
+ "learning_rate": 0.0005,
1546
+ "loss": 3.6162,
1547
+ "step": 2150
1548
+ },
1549
+ {
1550
+ "epoch": 0.7053061224489796,
1551
+ "grad_norm": 1.216117262840271,
1552
+ "learning_rate": 0.0005,
1553
+ "loss": 3.5505,
1554
+ "step": 2160
1555
+ },
1556
+ {
1557
+ "epoch": 0.7085714285714285,
1558
+ "grad_norm": 1.181820034980774,
1559
+ "learning_rate": 0.0005,
1560
+ "loss": 4.0493,
1561
+ "step": 2170
1562
+ },
1563
+ {
1564
+ "epoch": 0.7118367346938775,
1565
+ "grad_norm": 1.181449294090271,
1566
+ "learning_rate": 0.0005,
1567
+ "loss": 3.7653,
1568
+ "step": 2180
1569
+ },
1570
+ {
1571
+ "epoch": 0.7151020408163266,
1572
+ "grad_norm": 1.350578784942627,
1573
+ "learning_rate": 0.0005,
1574
+ "loss": 3.7466,
1575
+ "step": 2190
1576
+ },
1577
+ {
1578
+ "epoch": 0.7183673469387755,
1579
+ "grad_norm": 2.2694125175476074,
1580
+ "learning_rate": 0.0005,
1581
+ "loss": 3.6147,
1582
+ "step": 2200
1583
+ },
1584
+ {
1585
+ "epoch": 0.7216326530612245,
1586
+ "grad_norm": 1.4255095720291138,
1587
+ "learning_rate": 0.0005,
1588
+ "loss": 3.5044,
1589
+ "step": 2210
1590
+ },
1591
+ {
1592
+ "epoch": 0.7248979591836735,
1593
+ "grad_norm": 1.037293791770935,
1594
+ "learning_rate": 0.0005,
1595
+ "loss": 3.8757,
1596
+ "step": 2220
1597
+ },
1598
+ {
1599
+ "epoch": 0.7281632653061224,
1600
+ "grad_norm": 1.4334427118301392,
1601
+ "learning_rate": 0.0005,
1602
+ "loss": 3.856,
1603
+ "step": 2230
1604
+ },
1605
+ {
1606
+ "epoch": 0.7314285714285714,
1607
+ "grad_norm": 1.3529642820358276,
1608
+ "learning_rate": 0.0005,
1609
+ "loss": 3.6443,
1610
+ "step": 2240
1611
+ },
1612
+ {
1613
+ "epoch": 0.7346938775510204,
1614
+ "grad_norm": 2.0167791843414307,
1615
+ "learning_rate": 0.0005,
1616
+ "loss": 3.6821,
1617
+ "step": 2250
1618
+ },
1619
+ {
1620
+ "epoch": 0.7379591836734694,
1621
+ "grad_norm": 1.011146068572998,
1622
+ "learning_rate": 0.0005,
1623
+ "loss": 3.2779,
1624
+ "step": 2260
1625
+ },
1626
+ {
1627
+ "epoch": 0.7412244897959184,
1628
+ "grad_norm": 1.0214072465896606,
1629
+ "learning_rate": 0.0005,
1630
+ "loss": 3.8809,
1631
+ "step": 2270
1632
+ },
1633
+ {
1634
+ "epoch": 0.7444897959183674,
1635
+ "grad_norm": 1.3080716133117676,
1636
+ "learning_rate": 0.0005,
1637
+ "loss": 3.7988,
1638
+ "step": 2280
1639
+ },
1640
+ {
1641
+ "epoch": 0.7477551020408163,
1642
+ "grad_norm": 1.4071499109268188,
1643
+ "learning_rate": 0.0005,
1644
+ "loss": 3.7096,
1645
+ "step": 2290
1646
+ },
1647
+ {
1648
+ "epoch": 0.7510204081632653,
1649
+ "grad_norm": 2.372915744781494,
1650
+ "learning_rate": 0.0005,
1651
+ "loss": 3.6902,
1652
+ "step": 2300
1653
+ },
1654
+ {
1655
+ "epoch": 0.7542857142857143,
1656
+ "grad_norm": 1.1310540437698364,
1657
+ "learning_rate": 0.0005,
1658
+ "loss": 3.5033,
1659
+ "step": 2310
1660
+ },
1661
+ {
1662
+ "epoch": 0.7575510204081632,
1663
+ "grad_norm": 1.1746577024459839,
1664
+ "learning_rate": 0.0005,
1665
+ "loss": 3.9845,
1666
+ "step": 2320
1667
+ },
1668
+ {
1669
+ "epoch": 0.7608163265306123,
1670
+ "grad_norm": 1.3103886842727661,
1671
+ "learning_rate": 0.0005,
1672
+ "loss": 3.7691,
1673
+ "step": 2330
1674
+ },
1675
+ {
1676
+ "epoch": 0.7640816326530612,
1677
+ "grad_norm": 1.3824633359909058,
1678
+ "learning_rate": 0.0005,
1679
+ "loss": 3.5941,
1680
+ "step": 2340
1681
+ },
1682
+ {
1683
+ "epoch": 0.7673469387755102,
1684
+ "grad_norm": 2.100325345993042,
1685
+ "learning_rate": 0.0005,
1686
+ "loss": 3.6041,
1687
+ "step": 2350
1688
+ },
1689
+ {
1690
+ "epoch": 0.7706122448979592,
1691
+ "grad_norm": 1.1913516521453857,
1692
+ "learning_rate": 0.0005,
1693
+ "loss": 3.3779,
1694
+ "step": 2360
1695
+ },
1696
+ {
1697
+ "epoch": 0.7738775510204081,
1698
+ "grad_norm": 0.9938827753067017,
1699
+ "learning_rate": 0.0005,
1700
+ "loss": 3.8768,
1701
+ "step": 2370
1702
+ },
1703
+ {
1704
+ "epoch": 0.7771428571428571,
1705
+ "grad_norm": 1.2203476428985596,
1706
+ "learning_rate": 0.0005,
1707
+ "loss": 3.6631,
1708
+ "step": 2380
1709
+ },
1710
+ {
1711
+ "epoch": 0.7804081632653062,
1712
+ "grad_norm": 1.2797229290008545,
1713
+ "learning_rate": 0.0005,
1714
+ "loss": 3.682,
1715
+ "step": 2390
1716
+ },
1717
+ {
1718
+ "epoch": 0.7836734693877551,
1719
+ "grad_norm": 1.820462703704834,
1720
+ "learning_rate": 0.0005,
1721
+ "loss": 3.63,
1722
+ "step": 2400
1723
+ },
1724
+ {
1725
+ "epoch": 0.7869387755102041,
1726
+ "grad_norm": 1.147829532623291,
1727
+ "learning_rate": 0.0005,
1728
+ "loss": 3.3512,
1729
+ "step": 2410
1730
+ },
1731
+ {
1732
+ "epoch": 0.7902040816326531,
1733
+ "grad_norm": 1.2591899633407593,
1734
+ "learning_rate": 0.0005,
1735
+ "loss": 3.8567,
1736
+ "step": 2420
1737
+ },
1738
+ {
1739
+ "epoch": 0.793469387755102,
1740
+ "grad_norm": 1.2597836256027222,
1741
+ "learning_rate": 0.0005,
1742
+ "loss": 3.7401,
1743
+ "step": 2430
1744
+ },
1745
+ {
1746
+ "epoch": 0.796734693877551,
1747
+ "grad_norm": 1.3258607387542725,
1748
+ "learning_rate": 0.0005,
1749
+ "loss": 3.5487,
1750
+ "step": 2440
1751
+ },
1752
+ {
1753
+ "epoch": 0.8,
1754
+ "grad_norm": 2.17008376121521,
1755
+ "learning_rate": 0.0005,
1756
+ "loss": 3.5872,
1757
+ "step": 2450
1758
+ },
1759
+ {
1760
+ "epoch": 0.803265306122449,
1761
+ "grad_norm": 1.0918580293655396,
1762
+ "learning_rate": 0.0005,
1763
+ "loss": 3.3497,
1764
+ "step": 2460
1765
+ },
1766
+ {
1767
+ "epoch": 0.806530612244898,
1768
+ "grad_norm": 1.1970785856246948,
1769
+ "learning_rate": 0.0005,
1770
+ "loss": 3.9383,
1771
+ "step": 2470
1772
+ },
1773
+ {
1774
+ "epoch": 0.809795918367347,
1775
+ "grad_norm": 1.2330458164215088,
1776
+ "learning_rate": 0.0005,
1777
+ "loss": 3.7035,
1778
+ "step": 2480
1779
+ },
1780
+ {
1781
+ "epoch": 0.8130612244897959,
1782
+ "grad_norm": 1.3976151943206787,
1783
+ "learning_rate": 0.0005,
1784
+ "loss": 3.6041,
1785
+ "step": 2490
1786
+ },
1787
+ {
1788
+ "epoch": 0.8163265306122449,
1789
+ "grad_norm": 1.8973948955535889,
1790
+ "learning_rate": 0.0005,
1791
+ "loss": 3.5651,
1792
+ "step": 2500
1793
+ },
1794
+ {
1795
+ "epoch": 0.8163265306122449,
1796
+ "eval_loss": 3.65913724899292,
1797
+ "eval_runtime": 42.7312,
1798
+ "eval_samples_per_second": 46.804,
1799
+ "eval_steps_per_second": 11.701,
1800
+ "step": 2500
1801
+ },
1802
+ {
1803
+ "epoch": 0.8195918367346938,
1804
+ "grad_norm": 1.126738429069519,
1805
+ "learning_rate": 0.0005,
1806
+ "loss": 3.3749,
1807
+ "step": 2510
1808
+ },
1809
+ {
1810
+ "epoch": 0.8228571428571428,
1811
+ "grad_norm": 1.2251478433609009,
1812
+ "learning_rate": 0.0005,
1813
+ "loss": 3.884,
1814
+ "step": 2520
1815
+ },
1816
+ {
1817
+ "epoch": 0.8261224489795919,
1818
+ "grad_norm": 1.219794750213623,
1819
+ "learning_rate": 0.0005,
1820
+ "loss": 3.6404,
1821
+ "step": 2530
1822
+ },
1823
+ {
1824
+ "epoch": 0.8293877551020408,
1825
+ "grad_norm": 1.3533365726470947,
1826
+ "learning_rate": 0.0005,
1827
+ "loss": 3.5655,
1828
+ "step": 2540
1829
+ },
1830
+ {
1831
+ "epoch": 0.8326530612244898,
1832
+ "grad_norm": 1.7649017572402954,
1833
+ "learning_rate": 0.0005,
1834
+ "loss": 3.6185,
1835
+ "step": 2550
1836
+ },
1837
+ {
1838
+ "epoch": 0.8359183673469388,
1839
+ "grad_norm": 1.0244803428649902,
1840
+ "learning_rate": 0.0005,
1841
+ "loss": 3.4428,
1842
+ "step": 2560
1843
+ },
1844
+ {
1845
+ "epoch": 0.8391836734693877,
1846
+ "grad_norm": 1.3756012916564941,
1847
+ "learning_rate": 0.0005,
1848
+ "loss": 3.8133,
1849
+ "step": 2570
1850
+ },
1851
+ {
1852
+ "epoch": 0.8424489795918367,
1853
+ "grad_norm": 1.4523794651031494,
1854
+ "learning_rate": 0.0005,
1855
+ "loss": 3.6916,
1856
+ "step": 2580
1857
+ },
1858
+ {
1859
+ "epoch": 0.8457142857142858,
1860
+ "grad_norm": 1.4527126550674438,
1861
+ "learning_rate": 0.0005,
1862
+ "loss": 3.5812,
1863
+ "step": 2590
1864
+ },
1865
+ {
1866
+ "epoch": 0.8489795918367347,
1867
+ "grad_norm": 1.7975162267684937,
1868
+ "learning_rate": 0.0005,
1869
+ "loss": 3.6506,
1870
+ "step": 2600
1871
+ },
1872
+ {
1873
+ "epoch": 0.8522448979591837,
1874
+ "grad_norm": 1.29179048538208,
1875
+ "learning_rate": 0.0005,
1876
+ "loss": 3.4787,
1877
+ "step": 2610
1878
+ },
1879
+ {
1880
+ "epoch": 0.8555102040816327,
1881
+ "grad_norm": 1.2570093870162964,
1882
+ "learning_rate": 0.0005,
1883
+ "loss": 3.933,
1884
+ "step": 2620
1885
+ },
1886
+ {
1887
+ "epoch": 0.8587755102040816,
1888
+ "grad_norm": 1.350013017654419,
1889
+ "learning_rate": 0.0005,
1890
+ "loss": 3.6734,
1891
+ "step": 2630
1892
+ },
1893
+ {
1894
+ "epoch": 0.8620408163265306,
1895
+ "grad_norm": 1.4130898714065552,
1896
+ "learning_rate": 0.0005,
1897
+ "loss": 3.4979,
1898
+ "step": 2640
1899
+ },
1900
+ {
1901
+ "epoch": 0.8653061224489796,
1902
+ "grad_norm": 2.0199756622314453,
1903
+ "learning_rate": 0.0005,
1904
+ "loss": 3.5747,
1905
+ "step": 2650
1906
+ },
1907
+ {
1908
+ "epoch": 0.8685714285714285,
1909
+ "grad_norm": 1.0159718990325928,
1910
+ "learning_rate": 0.0005,
1911
+ "loss": 3.4062,
1912
+ "step": 2660
1913
+ },
1914
+ {
1915
+ "epoch": 0.8718367346938776,
1916
+ "grad_norm": 1.2518935203552246,
1917
+ "learning_rate": 0.0005,
1918
+ "loss": 3.7682,
1919
+ "step": 2670
1920
+ },
1921
+ {
1922
+ "epoch": 0.8751020408163265,
1923
+ "grad_norm": 1.2634557485580444,
1924
+ "learning_rate": 0.0005,
1925
+ "loss": 3.7365,
1926
+ "step": 2680
1927
+ },
1928
+ {
1929
+ "epoch": 0.8783673469387755,
1930
+ "grad_norm": 1.2956312894821167,
1931
+ "learning_rate": 0.0005,
1932
+ "loss": 3.6749,
1933
+ "step": 2690
1934
+ },
1935
+ {
1936
+ "epoch": 0.8816326530612245,
1937
+ "grad_norm": 1.8405163288116455,
1938
+ "learning_rate": 0.0005,
1939
+ "loss": 3.5182,
1940
+ "step": 2700
1941
+ },
1942
+ {
1943
+ "epoch": 0.8848979591836734,
1944
+ "grad_norm": 1.1379750967025757,
1945
+ "learning_rate": 0.0005,
1946
+ "loss": 3.376,
1947
+ "step": 2710
1948
+ },
1949
+ {
1950
+ "epoch": 0.8881632653061224,
1951
+ "grad_norm": 1.3452417850494385,
1952
+ "learning_rate": 0.0005,
1953
+ "loss": 3.8657,
1954
+ "step": 2720
1955
+ },
1956
+ {
1957
+ "epoch": 0.8914285714285715,
1958
+ "grad_norm": 1.1698819398880005,
1959
+ "learning_rate": 0.0005,
1960
+ "loss": 3.633,
1961
+ "step": 2730
1962
+ },
1963
+ {
1964
+ "epoch": 0.8946938775510204,
1965
+ "grad_norm": 1.42051100730896,
1966
+ "learning_rate": 0.0005,
1967
+ "loss": 3.5665,
1968
+ "step": 2740
1969
+ },
1970
+ {
1971
+ "epoch": 0.8979591836734694,
1972
+ "grad_norm": 1.9082551002502441,
1973
+ "learning_rate": 0.0005,
1974
+ "loss": 3.5884,
1975
+ "step": 2750
1976
+ },
1977
+ {
1978
+ "epoch": 0.9012244897959184,
1979
+ "grad_norm": 1.3937710523605347,
1980
+ "learning_rate": 0.0005,
1981
+ "loss": 3.3543,
1982
+ "step": 2760
1983
+ },
1984
+ {
1985
+ "epoch": 0.9044897959183673,
1986
+ "grad_norm": 1.2848858833312988,
1987
+ "learning_rate": 0.0005,
1988
+ "loss": 3.8298,
1989
+ "step": 2770
1990
+ },
1991
+ {
1992
+ "epoch": 0.9077551020408163,
1993
+ "grad_norm": 1.2677395343780518,
1994
+ "learning_rate": 0.0005,
1995
+ "loss": 3.6608,
1996
+ "step": 2780
1997
+ },
1998
+ {
1999
+ "epoch": 0.9110204081632653,
2000
+ "grad_norm": 1.372312307357788,
2001
+ "learning_rate": 0.0005,
2002
+ "loss": 3.5296,
2003
+ "step": 2790
2004
+ },
2005
+ {
2006
+ "epoch": 0.9142857142857143,
2007
+ "grad_norm": 1.928770899772644,
2008
+ "learning_rate": 0.0005,
2009
+ "loss": 3.5219,
2010
+ "step": 2800
2011
+ },
2012
+ {
2013
+ "epoch": 0.9175510204081633,
2014
+ "grad_norm": 1.1431926488876343,
2015
+ "learning_rate": 0.0005,
2016
+ "loss": 3.3378,
2017
+ "step": 2810
2018
+ },
2019
+ {
2020
+ "epoch": 0.9208163265306123,
2021
+ "grad_norm": 1.5195988416671753,
2022
+ "learning_rate": 0.0005,
2023
+ "loss": 3.8191,
2024
+ "step": 2820
2025
+ },
2026
+ {
2027
+ "epoch": 0.9240816326530612,
2028
+ "grad_norm": 1.3690383434295654,
2029
+ "learning_rate": 0.0005,
2030
+ "loss": 3.5557,
2031
+ "step": 2830
2032
+ },
2033
+ {
2034
+ "epoch": 0.9273469387755102,
2035
+ "grad_norm": 1.473240852355957,
2036
+ "learning_rate": 0.0005,
2037
+ "loss": 3.5422,
2038
+ "step": 2840
2039
+ },
2040
+ {
2041
+ "epoch": 0.9306122448979591,
2042
+ "grad_norm": 2.0330464839935303,
2043
+ "learning_rate": 0.0005,
2044
+ "loss": 3.6382,
2045
+ "step": 2850
2046
+ },
2047
+ {
2048
+ "epoch": 0.9338775510204081,
2049
+ "grad_norm": 1.0458343029022217,
2050
+ "learning_rate": 0.0005,
2051
+ "loss": 3.3883,
2052
+ "step": 2860
2053
+ },
2054
+ {
2055
+ "epoch": 0.9371428571428572,
2056
+ "grad_norm": 1.1635581254959106,
2057
+ "learning_rate": 0.0005,
2058
+ "loss": 3.8158,
2059
+ "step": 2870
2060
+ },
2061
+ {
2062
+ "epoch": 0.9404081632653061,
2063
+ "grad_norm": 1.3448679447174072,
2064
+ "learning_rate": 0.0005,
2065
+ "loss": 3.6389,
2066
+ "step": 2880
2067
+ },
2068
+ {
2069
+ "epoch": 0.9436734693877551,
2070
+ "grad_norm": 1.3173009157180786,
2071
+ "learning_rate": 0.0005,
2072
+ "loss": 3.5386,
2073
+ "step": 2890
2074
+ },
2075
+ {
2076
+ "epoch": 0.9469387755102041,
2077
+ "grad_norm": 2.144378662109375,
2078
+ "learning_rate": 0.0005,
2079
+ "loss": 3.5892,
2080
+ "step": 2900
2081
+ },
2082
+ {
2083
+ "epoch": 0.950204081632653,
2084
+ "grad_norm": 1.1590373516082764,
2085
+ "learning_rate": 0.0005,
2086
+ "loss": 3.397,
2087
+ "step": 2910
2088
+ },
2089
+ {
2090
+ "epoch": 0.953469387755102,
2091
+ "grad_norm": 1.253893494606018,
2092
+ "learning_rate": 0.0005,
2093
+ "loss": 3.8054,
2094
+ "step": 2920
2095
+ },
2096
+ {
2097
+ "epoch": 0.9567346938775511,
2098
+ "grad_norm": 1.215649962425232,
2099
+ "learning_rate": 0.0005,
2100
+ "loss": 3.612,
2101
+ "step": 2930
2102
+ },
2103
+ {
2104
+ "epoch": 0.96,
2105
+ "grad_norm": 1.4085159301757812,
2106
+ "learning_rate": 0.0005,
2107
+ "loss": 3.5056,
2108
+ "step": 2940
2109
+ },
2110
+ {
2111
+ "epoch": 0.963265306122449,
2112
+ "grad_norm": 1.908341646194458,
2113
+ "learning_rate": 0.0005,
2114
+ "loss": 3.4516,
2115
+ "step": 2950
2116
+ },
2117
+ {
2118
+ "epoch": 0.966530612244898,
2119
+ "grad_norm": 1.1971689462661743,
2120
+ "learning_rate": 0.0005,
2121
+ "loss": 3.3546,
2122
+ "step": 2960
2123
+ },
2124
+ {
2125
+ "epoch": 0.9697959183673469,
2126
+ "grad_norm": 1.2646092176437378,
2127
+ "learning_rate": 0.0005,
2128
+ "loss": 3.8237,
2129
+ "step": 2970
2130
+ },
2131
+ {
2132
+ "epoch": 0.9730612244897959,
2133
+ "grad_norm": 1.3261162042617798,
2134
+ "learning_rate": 0.0005,
2135
+ "loss": 3.6123,
2136
+ "step": 2980
2137
+ },
2138
+ {
2139
+ "epoch": 0.976326530612245,
2140
+ "grad_norm": 1.4025673866271973,
2141
+ "learning_rate": 0.0005,
2142
+ "loss": 3.5225,
2143
+ "step": 2990
2144
+ },
2145
+ {
2146
+ "epoch": 0.9795918367346939,
2147
+ "grad_norm": 1.9254060983657837,
2148
+ "learning_rate": 0.0005,
2149
+ "loss": 3.5353,
2150
+ "step": 3000
2151
+ },
2152
+ {
2153
+ "epoch": 0.9795918367346939,
2154
+ "eval_loss": 3.575631618499756,
2155
+ "eval_runtime": 42.8247,
2156
+ "eval_samples_per_second": 46.702,
2157
+ "eval_steps_per_second": 11.675,
2158
+ "step": 3000
2159
+ },
2160
+ {
2161
+ "epoch": 0.9828571428571429,
2162
+ "grad_norm": 1.1695542335510254,
2163
+ "learning_rate": 0.0005,
2164
+ "loss": 3.1419,
2165
+ "step": 3010
2166
+ },
2167
+ {
2168
+ "epoch": 0.9861224489795918,
2169
+ "grad_norm": 1.1839579343795776,
2170
+ "learning_rate": 0.0005,
2171
+ "loss": 3.7104,
2172
+ "step": 3020
2173
+ },
2174
+ {
2175
+ "epoch": 0.9893877551020408,
2176
+ "grad_norm": 1.3528823852539062,
2177
+ "learning_rate": 0.0005,
2178
+ "loss": 3.6772,
2179
+ "step": 3030
2180
+ },
2181
+ {
2182
+ "epoch": 0.9926530612244898,
2183
+ "grad_norm": 1.42439603805542,
2184
+ "learning_rate": 0.0005,
2185
+ "loss": 3.544,
2186
+ "step": 3040
2187
+ },
2188
+ {
2189
+ "epoch": 0.9959183673469387,
2190
+ "grad_norm": 2.387221574783325,
2191
+ "learning_rate": 0.0005,
2192
+ "loss": 3.536,
2193
+ "step": 3050
2194
+ },
2195
+ {
2196
+ "epoch": 0.9991836734693877,
2197
+ "grad_norm": 1.4861875772476196,
2198
+ "learning_rate": 0.0005,
2199
+ "loss": 3.5749,
2200
+ "step": 3060
2201
+ }
2202
+ ],
2203
+ "logging_steps": 10,
2204
+ "max_steps": 3063,
2205
+ "num_input_tokens_seen": 0,
2206
+ "num_train_epochs": 1,
2207
+ "save_steps": 500,
2208
+ "stateful_callbacks": {
2209
+ "TrainerControl": {
2210
+ "args": {
2211
+ "should_epoch_stop": false,
2212
+ "should_evaluate": false,
2213
+ "should_log": false,
2214
+ "should_save": true,
2215
+ "should_training_stop": true
2216
+ },
2217
+ "attributes": {}
2218
+ }
2219
+ },
2220
+ "total_flos": 28910317824000.0,
2221
+ "train_batch_size": 8,
2222
+ "trial_name": null,
2223
+ "trial_params": null
2224
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff