RikoteMaster commited on
Commit
979685b
·
verified ·
1 Parent(s): 0dd04f4

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
chat_template.jinja ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if messages[0]['role'] == 'system' %}
2
+ {%- set system_message = messages[0]['content'] %}
3
+ {%- set messages = messages[1:] %}
4
+ {%- else %}
5
+ {%- set system_message = 'You are a helpful assistant, that answer STEM questions. Here is the format in which you are supposed to answer: \nBelow you are provided with three example questions and the expected answer format you should give. Just answer with A, B, C, or D.\n\nThe following are multiple choice questions (with answers) about knowledge and skills in advanced master-level STEM courses.\n\nPerformance enhancing synthetic steroids are based on the structure of the hormone:\nA. testosterone.\nB. cortisol.\nC. progesterone.\nD. aldosterone.\nAnswer:A\n\nThe following are multiple choice questions (with answers) about knowledge and skills in advanced master-level STEM courses.\nAsp235Phe in a molecular report indicates that:\nA. asparagine has been replaced by phenylalanine.\nB. phenylalanine has been replaced by asparagine.\nC. aspartic acid has been replaced by phenylalanine.\nD. phenylalanine has been replaced by aspartic acid.\nAnswer:C\n\nThe following are multiple choice questions (with answers) about knowledge and skills in advanced master-level STEM courses.\nThe concept of V/f control of inverters driving induction motors results in:\nA. constant torque operation \nB. speed reversal \nC. reduced magnetic loss \nD. harmonic elimination \nAnswer:A\n\nAnswer the following question in the same way:' %}
6
+ {%- endif %}
7
+ {{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}
8
+ {%- for message in messages %}
9
+ {{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}
10
+ {%- endfor %}
11
+ {%- if add_generation_prompt %}
12
+ {{ '<|im_start|>assistant\n' }}
13
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151643,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "max_position_embeddings": 32768,
15
+ "max_window_layers": 28,
16
+ "model_type": "qwen3",
17
+ "num_attention_heads": 16,
18
+ "num_hidden_layers": 28,
19
+ "num_key_value_heads": 8,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": null,
22
+ "rope_theta": 1000000,
23
+ "sliding_window": null,
24
+ "tie_word_embeddings": true,
25
+ "torch_dtype": "bfloat16",
26
+ "transformers_version": "4.52.3",
27
+ "use_cache": true,
28
+ "use_sliding_window": false,
29
+ "vocab_size": 151936
30
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.52.3"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cae9fb38b5cd7572bca671ebd993033b3c55690ae74ce227f1567c00de2409f3
3
+ size 1192135096
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:322e1f70953507baa30c42a4991d4c48dec2cca979d24af22fca173b78b0c4f0
3
+ size 2384460363
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c0b314d655e9be31e93455deb9fbabd83362b86e9c047746bf9788399d89574
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a142a610e622c0db621d1f04872376e43d9d8e788468c7ecf51eee294b28b92
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|endoftext|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
trainer_state.json ADDED
@@ -0,0 +1,1625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 875,
3
+ "best_metric": 0.41610679030418396,
4
+ "best_model_checkpoint": "/home/ricoiban/GEMMA/mnlp_chatsplaining/results_model/try_ft/checkpoint-875",
5
+ "epoch": 0.5460448642266824,
6
+ "eval_steps": 25,
7
+ "global_step": 925,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0029515938606847697,
14
+ "grad_norm": 274.0,
15
+ "learning_rate": 4.800000000000001e-06,
16
+ "loss": 10.7873,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.0059031877213695395,
21
+ "grad_norm": 160.0,
22
+ "learning_rate": 1.08e-05,
23
+ "loss": 8.2818,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.00885478158205431,
28
+ "grad_norm": 96.0,
29
+ "learning_rate": 1.6800000000000002e-05,
30
+ "loss": 5.4171,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.011806375442739079,
35
+ "grad_norm": 141.0,
36
+ "learning_rate": 2.2800000000000002e-05,
37
+ "loss": 4.8734,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.01475796930342385,
42
+ "grad_norm": 111.0,
43
+ "learning_rate": 2.88e-05,
44
+ "loss": 4.5143,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.01475796930342385,
49
+ "eval_loss": 4.148748874664307,
50
+ "eval_runtime": 22.5903,
51
+ "eval_samples_per_second": 88.534,
52
+ "eval_steps_per_second": 88.534,
53
+ "step": 25
54
+ },
55
+ {
56
+ "epoch": 0.01770956316410862,
57
+ "grad_norm": 234.0,
58
+ "learning_rate": 3.48e-05,
59
+ "loss": 4.0827,
60
+ "step": 30
61
+ },
62
+ {
63
+ "epoch": 0.02066115702479339,
64
+ "grad_norm": 173.0,
65
+ "learning_rate": 4.08e-05,
66
+ "loss": 3.624,
67
+ "step": 35
68
+ },
69
+ {
70
+ "epoch": 0.023612750885478158,
71
+ "grad_norm": 91.0,
72
+ "learning_rate": 4.6800000000000006e-05,
73
+ "loss": 2.6036,
74
+ "step": 40
75
+ },
76
+ {
77
+ "epoch": 0.026564344746162927,
78
+ "grad_norm": 163.0,
79
+ "learning_rate": 5.28e-05,
80
+ "loss": 1.9856,
81
+ "step": 45
82
+ },
83
+ {
84
+ "epoch": 0.0295159386068477,
85
+ "grad_norm": 65.0,
86
+ "learning_rate": 5.88e-05,
87
+ "loss": 1.1888,
88
+ "step": 50
89
+ },
90
+ {
91
+ "epoch": 0.0295159386068477,
92
+ "eval_loss": 0.8685765862464905,
93
+ "eval_runtime": 22.0353,
94
+ "eval_samples_per_second": 90.764,
95
+ "eval_steps_per_second": 90.764,
96
+ "step": 50
97
+ },
98
+ {
99
+ "epoch": 0.032467532467532464,
100
+ "grad_norm": 31.125,
101
+ "learning_rate": 5.9999123594193744e-05,
102
+ "loss": 0.7088,
103
+ "step": 55
104
+ },
105
+ {
106
+ "epoch": 0.03541912632821724,
107
+ "grad_norm": 25.625,
108
+ "learning_rate": 5.9995563283365586e-05,
109
+ "loss": 0.4428,
110
+ "step": 60
111
+ },
112
+ {
113
+ "epoch": 0.03837072018890201,
114
+ "grad_norm": 38.25,
115
+ "learning_rate": 5.998926461693058e-05,
116
+ "loss": 0.7334,
117
+ "step": 65
118
+ },
119
+ {
120
+ "epoch": 0.04132231404958678,
121
+ "grad_norm": 13.375,
122
+ "learning_rate": 5.9980228169906714e-05,
123
+ "loss": 0.6483,
124
+ "step": 70
125
+ },
126
+ {
127
+ "epoch": 0.04427390791027155,
128
+ "grad_norm": 41.5,
129
+ "learning_rate": 5.9968454767249506e-05,
130
+ "loss": 0.5933,
131
+ "step": 75
132
+ },
133
+ {
134
+ "epoch": 0.04427390791027155,
135
+ "eval_loss": 0.6194470524787903,
136
+ "eval_runtime": 22.0585,
137
+ "eval_samples_per_second": 90.668,
138
+ "eval_steps_per_second": 90.668,
139
+ "step": 75
140
+ },
141
+ {
142
+ "epoch": 0.047225501770956316,
143
+ "grad_norm": 26.0,
144
+ "learning_rate": 5.995394548377669e-05,
145
+ "loss": 0.7882,
146
+ "step": 80
147
+ },
148
+ {
149
+ "epoch": 0.050177095631641085,
150
+ "grad_norm": 17.875,
151
+ "learning_rate": 5.993670164407008e-05,
152
+ "loss": 0.6423,
153
+ "step": 85
154
+ },
155
+ {
156
+ "epoch": 0.053128689492325853,
157
+ "grad_norm": 19.75,
158
+ "learning_rate": 5.991672482235466e-05,
159
+ "loss": 0.6952,
160
+ "step": 90
161
+ },
162
+ {
163
+ "epoch": 0.05608028335301062,
164
+ "grad_norm": 19.875,
165
+ "learning_rate": 5.9894016842354855e-05,
166
+ "loss": 0.6287,
167
+ "step": 95
168
+ },
169
+ {
170
+ "epoch": 0.0590318772136954,
171
+ "grad_norm": 19.75,
172
+ "learning_rate": 5.986857977712809e-05,
173
+ "loss": 0.6606,
174
+ "step": 100
175
+ },
176
+ {
177
+ "epoch": 0.0590318772136954,
178
+ "eval_loss": 0.5243311524391174,
179
+ "eval_runtime": 22.0497,
180
+ "eval_samples_per_second": 90.704,
181
+ "eval_steps_per_second": 90.704,
182
+ "step": 100
183
+ },
184
+ {
185
+ "epoch": 0.06198347107438017,
186
+ "grad_norm": 26.25,
187
+ "learning_rate": 5.9840415948875444e-05,
188
+ "loss": 0.6193,
189
+ "step": 105
190
+ },
191
+ {
192
+ "epoch": 0.06493506493506493,
193
+ "grad_norm": 9.9375,
194
+ "learning_rate": 5.980952792872975e-05,
195
+ "loss": 0.5167,
196
+ "step": 110
197
+ },
198
+ {
199
+ "epoch": 0.0678866587957497,
200
+ "grad_norm": 35.0,
201
+ "learning_rate": 5.9775918536520786e-05,
202
+ "loss": 0.5922,
203
+ "step": 115
204
+ },
205
+ {
206
+ "epoch": 0.07083825265643448,
207
+ "grad_norm": 35.25,
208
+ "learning_rate": 5.973959084051791e-05,
209
+ "loss": 0.7114,
210
+ "step": 120
211
+ },
212
+ {
213
+ "epoch": 0.07378984651711924,
214
+ "grad_norm": 8.5625,
215
+ "learning_rate": 5.970054815714995e-05,
216
+ "loss": 0.6309,
217
+ "step": 125
218
+ },
219
+ {
220
+ "epoch": 0.07378984651711924,
221
+ "eval_loss": 0.644968569278717,
222
+ "eval_runtime": 22.0752,
223
+ "eval_samples_per_second": 90.6,
224
+ "eval_steps_per_second": 90.6,
225
+ "step": 125
226
+ },
227
+ {
228
+ "epoch": 0.07674144037780402,
229
+ "grad_norm": 19.375,
230
+ "learning_rate": 5.965879405070235e-05,
231
+ "loss": 0.622,
232
+ "step": 130
233
+ },
234
+ {
235
+ "epoch": 0.07969303423848878,
236
+ "grad_norm": 15.4375,
237
+ "learning_rate": 5.961433233299193e-05,
238
+ "loss": 0.5902,
239
+ "step": 135
240
+ },
241
+ {
242
+ "epoch": 0.08264462809917356,
243
+ "grad_norm": 19.375,
244
+ "learning_rate": 5.956716706301877e-05,
245
+ "loss": 0.6647,
246
+ "step": 140
247
+ },
248
+ {
249
+ "epoch": 0.08559622195985832,
250
+ "grad_norm": 11.0,
251
+ "learning_rate": 5.951730254659569e-05,
252
+ "loss": 0.7817,
253
+ "step": 145
254
+ },
255
+ {
256
+ "epoch": 0.0885478158205431,
257
+ "grad_norm": 16.875,
258
+ "learning_rate": 5.946474333595521e-05,
259
+ "loss": 0.6154,
260
+ "step": 150
261
+ },
262
+ {
263
+ "epoch": 0.0885478158205431,
264
+ "eval_loss": 0.5631881356239319,
265
+ "eval_runtime": 22.058,
266
+ "eval_samples_per_second": 90.67,
267
+ "eval_steps_per_second": 90.67,
268
+ "step": 150
269
+ },
270
+ {
271
+ "epoch": 0.09149940968122787,
272
+ "grad_norm": 17.625,
273
+ "learning_rate": 5.9409494229333904e-05,
274
+ "loss": 0.6532,
275
+ "step": 155
276
+ },
277
+ {
278
+ "epoch": 0.09445100354191263,
279
+ "grad_norm": 22.25,
280
+ "learning_rate": 5.935156027053442e-05,
281
+ "loss": 0.6099,
282
+ "step": 160
283
+ },
284
+ {
285
+ "epoch": 0.09740259740259741,
286
+ "grad_norm": 25.5,
287
+ "learning_rate": 5.929094674846495e-05,
288
+ "loss": 0.7848,
289
+ "step": 165
290
+ },
291
+ {
292
+ "epoch": 0.10035419126328217,
293
+ "grad_norm": 22.0,
294
+ "learning_rate": 5.922765919665644e-05,
295
+ "loss": 0.5597,
296
+ "step": 170
297
+ },
298
+ {
299
+ "epoch": 0.10330578512396695,
300
+ "grad_norm": 11.9375,
301
+ "learning_rate": 5.916170339275745e-05,
302
+ "loss": 0.6166,
303
+ "step": 175
304
+ },
305
+ {
306
+ "epoch": 0.10330578512396695,
307
+ "eval_loss": 0.5547081828117371,
308
+ "eval_runtime": 22.0339,
309
+ "eval_samples_per_second": 90.769,
310
+ "eval_steps_per_second": 90.769,
311
+ "step": 175
312
+ },
313
+ {
314
+ "epoch": 0.10625737898465171,
315
+ "grad_norm": 12.625,
316
+ "learning_rate": 5.909308535800664e-05,
317
+ "loss": 0.5548,
318
+ "step": 180
319
+ },
320
+ {
321
+ "epoch": 0.10920897284533648,
322
+ "grad_norm": 32.5,
323
+ "learning_rate": 5.90218113566831e-05,
324
+ "loss": 0.5793,
325
+ "step": 185
326
+ },
327
+ {
328
+ "epoch": 0.11216056670602124,
329
+ "grad_norm": 19.0,
330
+ "learning_rate": 5.8947887895534504e-05,
331
+ "loss": 0.8419,
332
+ "step": 190
333
+ },
334
+ {
335
+ "epoch": 0.11511216056670602,
336
+ "grad_norm": 12.5625,
337
+ "learning_rate": 5.8871321723183046e-05,
338
+ "loss": 0.6571,
339
+ "step": 195
340
+ },
341
+ {
342
+ "epoch": 0.1180637544273908,
343
+ "grad_norm": 18.375,
344
+ "learning_rate": 5.879211982950937e-05,
345
+ "loss": 0.6579,
346
+ "step": 200
347
+ },
348
+ {
349
+ "epoch": 0.1180637544273908,
350
+ "eval_loss": 0.5850950479507446,
351
+ "eval_runtime": 22.079,
352
+ "eval_samples_per_second": 90.584,
353
+ "eval_steps_per_second": 90.584,
354
+ "step": 200
355
+ },
356
+ {
357
+ "epoch": 0.12101534828807556,
358
+ "grad_norm": 20.0,
359
+ "learning_rate": 5.871028944501446e-05,
360
+ "loss": 0.5835,
361
+ "step": 205
362
+ },
363
+ {
364
+ "epoch": 0.12396694214876033,
365
+ "grad_norm": 20.25,
366
+ "learning_rate": 5.862583804015953e-05,
367
+ "loss": 0.5418,
368
+ "step": 210
369
+ },
370
+ {
371
+ "epoch": 0.1269185360094451,
372
+ "grad_norm": 27.375,
373
+ "learning_rate": 5.853877332468404e-05,
374
+ "loss": 0.6755,
375
+ "step": 215
376
+ },
377
+ {
378
+ "epoch": 0.12987012987012986,
379
+ "grad_norm": 29.5,
380
+ "learning_rate": 5.844910324690187e-05,
381
+ "loss": 0.6172,
382
+ "step": 220
383
+ },
384
+ {
385
+ "epoch": 0.13282172373081463,
386
+ "grad_norm": 25.375,
387
+ "learning_rate": 5.835683599297568e-05,
388
+ "loss": 0.5844,
389
+ "step": 225
390
+ },
391
+ {
392
+ "epoch": 0.13282172373081463,
393
+ "eval_loss": 0.5579066276550293,
394
+ "eval_runtime": 22.0218,
395
+ "eval_samples_per_second": 90.819,
396
+ "eval_steps_per_second": 90.819,
397
+ "step": 225
398
+ },
399
+ {
400
+ "epoch": 0.1357733175914994,
401
+ "grad_norm": 16.5,
402
+ "learning_rate": 5.8261979986169596e-05,
403
+ "loss": 0.6147,
404
+ "step": 230
405
+ },
406
+ {
407
+ "epoch": 0.13872491145218419,
408
+ "grad_norm": 10.6875,
409
+ "learning_rate": 5.816454388608023e-05,
410
+ "loss": 0.6352,
411
+ "step": 235
412
+ },
413
+ {
414
+ "epoch": 0.14167650531286896,
415
+ "grad_norm": 16.125,
416
+ "learning_rate": 5.8064536587846115e-05,
417
+ "loss": 0.7107,
418
+ "step": 240
419
+ },
420
+ {
421
+ "epoch": 0.1446280991735537,
422
+ "grad_norm": 23.5,
423
+ "learning_rate": 5.7961967221335674e-05,
424
+ "loss": 0.6752,
425
+ "step": 245
426
+ },
427
+ {
428
+ "epoch": 0.14757969303423848,
429
+ "grad_norm": 27.625,
430
+ "learning_rate": 5.7856845150313716e-05,
431
+ "loss": 0.6039,
432
+ "step": 250
433
+ },
434
+ {
435
+ "epoch": 0.14757969303423848,
436
+ "eval_loss": 0.5708025693893433,
437
+ "eval_runtime": 22.0736,
438
+ "eval_samples_per_second": 90.606,
439
+ "eval_steps_per_second": 90.606,
440
+ "step": 250
441
+ },
442
+ {
443
+ "epoch": 0.15053128689492326,
444
+ "grad_norm": 19.375,
445
+ "learning_rate": 5.7749179971586596e-05,
446
+ "loss": 0.6029,
447
+ "step": 255
448
+ },
449
+ {
450
+ "epoch": 0.15348288075560804,
451
+ "grad_norm": 24.75,
452
+ "learning_rate": 5.763898151412613e-05,
453
+ "loss": 0.4986,
454
+ "step": 260
455
+ },
456
+ {
457
+ "epoch": 0.15643447461629278,
458
+ "grad_norm": 36.0,
459
+ "learning_rate": 5.752625983817225e-05,
460
+ "loss": 0.6049,
461
+ "step": 265
462
+ },
463
+ {
464
+ "epoch": 0.15938606847697756,
465
+ "grad_norm": 22.25,
466
+ "learning_rate": 5.7411025234314634e-05,
467
+ "loss": 0.5895,
468
+ "step": 270
469
+ },
470
+ {
471
+ "epoch": 0.16233766233766234,
472
+ "grad_norm": 13.0,
473
+ "learning_rate": 5.729328822255319e-05,
474
+ "loss": 0.6957,
475
+ "step": 275
476
+ },
477
+ {
478
+ "epoch": 0.16233766233766234,
479
+ "eval_loss": 0.5568270087242126,
480
+ "eval_runtime": 22.052,
481
+ "eval_samples_per_second": 90.695,
482
+ "eval_steps_per_second": 90.695,
483
+ "step": 275
484
+ },
485
+ {
486
+ "epoch": 0.1652892561983471,
487
+ "grad_norm": 27.75,
488
+ "learning_rate": 5.717305955133773e-05,
489
+ "loss": 0.7125,
490
+ "step": 280
491
+ },
492
+ {
493
+ "epoch": 0.1682408500590319,
494
+ "grad_norm": 12.8125,
495
+ "learning_rate": 5.7050350196586686e-05,
496
+ "loss": 0.5977,
497
+ "step": 285
498
+ },
499
+ {
500
+ "epoch": 0.17119244391971664,
501
+ "grad_norm": 18.75,
502
+ "learning_rate": 5.692517136068511e-05,
503
+ "loss": 0.5908,
504
+ "step": 290
505
+ },
506
+ {
507
+ "epoch": 0.1741440377804014,
508
+ "grad_norm": 24.25,
509
+ "learning_rate": 5.679753447146195e-05,
510
+ "loss": 0.5334,
511
+ "step": 295
512
+ },
513
+ {
514
+ "epoch": 0.1770956316410862,
515
+ "grad_norm": 11.625,
516
+ "learning_rate": 5.666745118114688e-05,
517
+ "loss": 0.4347,
518
+ "step": 300
519
+ },
520
+ {
521
+ "epoch": 0.1770956316410862,
522
+ "eval_loss": 0.5026609301567078,
523
+ "eval_runtime": 22.0534,
524
+ "eval_samples_per_second": 90.689,
525
+ "eval_steps_per_second": 90.689,
526
+ "step": 300
527
+ },
528
+ {
529
+ "epoch": 0.18004722550177096,
530
+ "grad_norm": 24.75,
531
+ "learning_rate": 5.6534933365306394e-05,
532
+ "loss": 0.5473,
533
+ "step": 305
534
+ },
535
+ {
536
+ "epoch": 0.18299881936245574,
537
+ "grad_norm": 27.75,
538
+ "learning_rate": 5.6399993121759797e-05,
539
+ "loss": 0.5315,
540
+ "step": 310
541
+ },
542
+ {
543
+ "epoch": 0.1859504132231405,
544
+ "grad_norm": 11.3125,
545
+ "learning_rate": 5.626264276947469e-05,
546
+ "loss": 0.4026,
547
+ "step": 315
548
+ },
549
+ {
550
+ "epoch": 0.18890200708382526,
551
+ "grad_norm": 27.125,
552
+ "learning_rate": 5.612289484744238e-05,
553
+ "loss": 0.55,
554
+ "step": 320
555
+ },
556
+ {
557
+ "epoch": 0.19185360094451004,
558
+ "grad_norm": 29.375,
559
+ "learning_rate": 5.5980762113533166e-05,
560
+ "loss": 0.5988,
561
+ "step": 325
562
+ },
563
+ {
564
+ "epoch": 0.19185360094451004,
565
+ "eval_loss": 0.5229803919792175,
566
+ "eval_runtime": 22.0483,
567
+ "eval_samples_per_second": 90.71,
568
+ "eval_steps_per_second": 90.71,
569
+ "step": 325
570
+ },
571
+ {
572
+ "epoch": 0.19480519480519481,
573
+ "grad_norm": 18.0,
574
+ "learning_rate": 5.5836257543331644e-05,
575
+ "loss": 0.5174,
576
+ "step": 330
577
+ },
578
+ {
579
+ "epoch": 0.19775678866587956,
580
+ "grad_norm": 17.75,
581
+ "learning_rate": 5.568939432895213e-05,
582
+ "loss": 0.6662,
583
+ "step": 335
584
+ },
585
+ {
586
+ "epoch": 0.20070838252656434,
587
+ "grad_norm": 13.4375,
588
+ "learning_rate": 5.554018587783435e-05,
589
+ "loss": 0.6594,
590
+ "step": 340
591
+ },
592
+ {
593
+ "epoch": 0.20365997638724911,
594
+ "grad_norm": 20.125,
595
+ "learning_rate": 5.538864581151943e-05,
596
+ "loss": 0.5776,
597
+ "step": 345
598
+ },
599
+ {
600
+ "epoch": 0.2066115702479339,
601
+ "grad_norm": 29.0,
602
+ "learning_rate": 5.523478796440633e-05,
603
+ "loss": 0.6647,
604
+ "step": 350
605
+ },
606
+ {
607
+ "epoch": 0.2066115702479339,
608
+ "eval_loss": 0.5277854204177856,
609
+ "eval_runtime": 22.0408,
610
+ "eval_samples_per_second": 90.741,
611
+ "eval_steps_per_second": 90.741,
612
+ "step": 350
613
+ },
614
+ {
615
+ "epoch": 0.20956316410861867,
616
+ "grad_norm": 17.625,
617
+ "learning_rate": 5.507862638248896e-05,
618
+ "loss": 0.5446,
619
+ "step": 355
620
+ },
621
+ {
622
+ "epoch": 0.21251475796930341,
623
+ "grad_norm": 15.9375,
624
+ "learning_rate": 5.49201753220738e-05,
625
+ "loss": 0.5346,
626
+ "step": 360
627
+ },
628
+ {
629
+ "epoch": 0.2154663518299882,
630
+ "grad_norm": 13.8125,
631
+ "learning_rate": 5.475944924847845e-05,
632
+ "loss": 0.4782,
633
+ "step": 365
634
+ },
635
+ {
636
+ "epoch": 0.21841794569067297,
637
+ "grad_norm": 46.0,
638
+ "learning_rate": 5.459646283471106e-05,
639
+ "loss": 0.6363,
640
+ "step": 370
641
+ },
642
+ {
643
+ "epoch": 0.22136953955135774,
644
+ "grad_norm": 12.625,
645
+ "learning_rate": 5.443123096013083e-05,
646
+ "loss": 0.5603,
647
+ "step": 375
648
+ },
649
+ {
650
+ "epoch": 0.22136953955135774,
651
+ "eval_loss": 0.49818602204322815,
652
+ "eval_runtime": 22.0724,
653
+ "eval_samples_per_second": 90.611,
654
+ "eval_steps_per_second": 90.611,
655
+ "step": 375
656
+ },
657
+ {
658
+ "epoch": 0.2243211334120425,
659
+ "grad_norm": 25.125,
660
+ "learning_rate": 5.426376870908959e-05,
661
+ "loss": 0.6536,
662
+ "step": 380
663
+ },
664
+ {
665
+ "epoch": 0.22727272727272727,
666
+ "grad_norm": 17.25,
667
+ "learning_rate": 5.409409136955476e-05,
668
+ "loss": 0.5464,
669
+ "step": 385
670
+ },
671
+ {
672
+ "epoch": 0.23022432113341204,
673
+ "grad_norm": 23.375,
674
+ "learning_rate": 5.3922214431713654e-05,
675
+ "loss": 0.5587,
676
+ "step": 390
677
+ },
678
+ {
679
+ "epoch": 0.23317591499409682,
680
+ "grad_norm": 10.75,
681
+ "learning_rate": 5.3748153586559385e-05,
682
+ "loss": 0.5231,
683
+ "step": 395
684
+ },
685
+ {
686
+ "epoch": 0.2361275088547816,
687
+ "grad_norm": 19.875,
688
+ "learning_rate": 5.357192472445835e-05,
689
+ "loss": 0.503,
690
+ "step": 400
691
+ },
692
+ {
693
+ "epoch": 0.2361275088547816,
694
+ "eval_loss": 0.4984550178050995,
695
+ "eval_runtime": 22.074,
696
+ "eval_samples_per_second": 90.604,
697
+ "eval_steps_per_second": 90.604,
698
+ "step": 400
699
+ },
700
+ {
701
+ "epoch": 0.23907910271546634,
702
+ "grad_norm": 20.125,
703
+ "learning_rate": 5.339354393369962e-05,
704
+ "loss": 0.4524,
705
+ "step": 405
706
+ },
707
+ {
708
+ "epoch": 0.24203069657615112,
709
+ "grad_norm": 11.375,
710
+ "learning_rate": 5.321302749902615e-05,
711
+ "loss": 0.492,
712
+ "step": 410
713
+ },
714
+ {
715
+ "epoch": 0.2449822904368359,
716
+ "grad_norm": 20.75,
717
+ "learning_rate": 5.303039190014818e-05,
718
+ "loss": 0.4989,
719
+ "step": 415
720
+ },
721
+ {
722
+ "epoch": 0.24793388429752067,
723
+ "grad_norm": 14.0,
724
+ "learning_rate": 5.284565381023873e-05,
725
+ "loss": 0.6195,
726
+ "step": 420
727
+ },
728
+ {
729
+ "epoch": 0.25088547815820544,
730
+ "grad_norm": 25.125,
731
+ "learning_rate": 5.265883009441147e-05,
732
+ "loss": 0.5687,
733
+ "step": 425
734
+ },
735
+ {
736
+ "epoch": 0.25088547815820544,
737
+ "eval_loss": 0.4847257137298584,
738
+ "eval_runtime": 22.029,
739
+ "eval_samples_per_second": 90.789,
740
+ "eval_steps_per_second": 90.789,
741
+ "step": 425
742
+ },
743
+ {
744
+ "epoch": 0.2538370720188902,
745
+ "grad_norm": 16.75,
746
+ "learning_rate": 5.2469937808181055e-05,
747
+ "loss": 0.4048,
748
+ "step": 430
749
+ },
750
+ {
751
+ "epoch": 0.256788665879575,
752
+ "grad_norm": 39.75,
753
+ "learning_rate": 5.227899419590614e-05,
754
+ "loss": 0.5483,
755
+ "step": 435
756
+ },
757
+ {
758
+ "epoch": 0.2597402597402597,
759
+ "grad_norm": 15.125,
760
+ "learning_rate": 5.208601668921508e-05,
761
+ "loss": 0.4843,
762
+ "step": 440
763
+ },
764
+ {
765
+ "epoch": 0.2626918536009445,
766
+ "grad_norm": 15.5,
767
+ "learning_rate": 5.1891022905414546e-05,
768
+ "loss": 0.6146,
769
+ "step": 445
770
+ },
771
+ {
772
+ "epoch": 0.26564344746162927,
773
+ "grad_norm": 22.875,
774
+ "learning_rate": 5.169403064588125e-05,
775
+ "loss": 0.4279,
776
+ "step": 450
777
+ },
778
+ {
779
+ "epoch": 0.26564344746162927,
780
+ "eval_loss": 0.48101305961608887,
781
+ "eval_runtime": 22.0566,
782
+ "eval_samples_per_second": 90.676,
783
+ "eval_steps_per_second": 90.676,
784
+ "step": 450
785
+ },
786
+ {
787
+ "epoch": 0.26859504132231404,
788
+ "grad_norm": 28.625,
789
+ "learning_rate": 5.1495057894436757e-05,
790
+ "loss": 0.5749,
791
+ "step": 455
792
+ },
793
+ {
794
+ "epoch": 0.2715466351829988,
795
+ "grad_norm": 20.5,
796
+ "learning_rate": 5.1294122815705773e-05,
797
+ "loss": 0.4963,
798
+ "step": 460
799
+ },
800
+ {
801
+ "epoch": 0.2744982290436836,
802
+ "grad_norm": 22.25,
803
+ "learning_rate": 5.109124375345781e-05,
804
+ "loss": 0.4213,
805
+ "step": 465
806
+ },
807
+ {
808
+ "epoch": 0.27744982290436837,
809
+ "grad_norm": 18.125,
810
+ "learning_rate": 5.0886439228932576e-05,
811
+ "loss": 0.5002,
812
+ "step": 470
813
+ },
814
+ {
815
+ "epoch": 0.28040141676505315,
816
+ "grad_norm": 34.0,
817
+ "learning_rate": 5.067972793914911e-05,
818
+ "loss": 0.5136,
819
+ "step": 475
820
+ },
821
+ {
822
+ "epoch": 0.28040141676505315,
823
+ "eval_loss": 0.5137303471565247,
824
+ "eval_runtime": 22.0698,
825
+ "eval_samples_per_second": 90.621,
826
+ "eval_steps_per_second": 90.621,
827
+ "step": 475
828
+ },
829
+ {
830
+ "epoch": 0.2833530106257379,
831
+ "grad_norm": 15.0,
832
+ "learning_rate": 5.047112875519892e-05,
833
+ "loss": 0.5162,
834
+ "step": 480
835
+ },
836
+ {
837
+ "epoch": 0.28630460448642264,
838
+ "grad_norm": 38.0,
839
+ "learning_rate": 5.02606607205232e-05,
840
+ "loss": 0.4831,
841
+ "step": 485
842
+ },
843
+ {
844
+ "epoch": 0.2892561983471074,
845
+ "grad_norm": 26.5,
846
+ "learning_rate": 5.004834304917425e-05,
847
+ "loss": 0.5147,
848
+ "step": 490
849
+ },
850
+ {
851
+ "epoch": 0.2922077922077922,
852
+ "grad_norm": 37.0,
853
+ "learning_rate": 4.983419512406151e-05,
854
+ "loss": 0.553,
855
+ "step": 495
856
+ },
857
+ {
858
+ "epoch": 0.29515938606847697,
859
+ "grad_norm": 23.625,
860
+ "learning_rate": 4.9618236495181936e-05,
861
+ "loss": 0.3999,
862
+ "step": 500
863
+ },
864
+ {
865
+ "epoch": 0.29515938606847697,
866
+ "eval_loss": 0.5061969757080078,
867
+ "eval_runtime": 22.0382,
868
+ "eval_samples_per_second": 90.752,
869
+ "eval_steps_per_second": 90.752,
870
+ "step": 500
871
+ },
872
+ {
873
+ "epoch": 0.29811097992916175,
874
+ "grad_norm": 13.75,
875
+ "learning_rate": 4.9400486877835325e-05,
876
+ "loss": 0.4205,
877
+ "step": 505
878
+ },
879
+ {
880
+ "epoch": 0.3010625737898465,
881
+ "grad_norm": 22.5,
882
+ "learning_rate": 4.91809661508244e-05,
883
+ "loss": 0.5904,
884
+ "step": 510
885
+ },
886
+ {
887
+ "epoch": 0.3040141676505313,
888
+ "grad_norm": 16.75,
889
+ "learning_rate": 4.895969435464009e-05,
890
+ "loss": 0.5749,
891
+ "step": 515
892
+ },
893
+ {
894
+ "epoch": 0.3069657615112161,
895
+ "grad_norm": 10.8125,
896
+ "learning_rate": 4.873669168963196e-05,
897
+ "loss": 0.5841,
898
+ "step": 520
899
+ },
900
+ {
901
+ "epoch": 0.30991735537190085,
902
+ "grad_norm": 14.25,
903
+ "learning_rate": 4.851197851416409e-05,
904
+ "loss": 0.5454,
905
+ "step": 525
906
+ },
907
+ {
908
+ "epoch": 0.30991735537190085,
909
+ "eval_loss": 0.6045746207237244,
910
+ "eval_runtime": 22.0653,
911
+ "eval_samples_per_second": 90.64,
912
+ "eval_steps_per_second": 90.64,
913
+ "step": 525
914
+ },
915
+ {
916
+ "epoch": 0.31286894923258557,
917
+ "grad_norm": 12.4375,
918
+ "learning_rate": 4.828557534275651e-05,
919
+ "loss": 0.4654,
920
+ "step": 530
921
+ },
922
+ {
923
+ "epoch": 0.31582054309327035,
924
+ "grad_norm": 28.125,
925
+ "learning_rate": 4.8057502844212406e-05,
926
+ "loss": 0.5669,
927
+ "step": 535
928
+ },
929
+ {
930
+ "epoch": 0.3187721369539551,
931
+ "grad_norm": 17.625,
932
+ "learning_rate": 4.78277818397312e-05,
933
+ "loss": 0.3907,
934
+ "step": 540
935
+ },
936
+ {
937
+ "epoch": 0.3217237308146399,
938
+ "grad_norm": 12.625,
939
+ "learning_rate": 4.7596433301007775e-05,
940
+ "loss": 0.508,
941
+ "step": 545
942
+ },
943
+ {
944
+ "epoch": 0.3246753246753247,
945
+ "grad_norm": 26.75,
946
+ "learning_rate": 4.736347834831789e-05,
947
+ "loss": 0.408,
948
+ "step": 550
949
+ },
950
+ {
951
+ "epoch": 0.3246753246753247,
952
+ "eval_loss": 0.48477041721343994,
953
+ "eval_runtime": 22.0413,
954
+ "eval_samples_per_second": 90.739,
955
+ "eval_steps_per_second": 90.739,
956
+ "step": 550
957
+ },
958
+ {
959
+ "epoch": 0.32762691853600945,
960
+ "grad_norm": 32.25,
961
+ "learning_rate": 4.712893824859008e-05,
962
+ "loss": 0.3498,
963
+ "step": 555
964
+ },
965
+ {
966
+ "epoch": 0.3305785123966942,
967
+ "grad_norm": 21.25,
968
+ "learning_rate": 4.6892834413464163e-05,
969
+ "loss": 0.6738,
970
+ "step": 560
971
+ },
972
+ {
973
+ "epoch": 0.333530106257379,
974
+ "grad_norm": 23.0,
975
+ "learning_rate": 4.6655188397336515e-05,
976
+ "loss": 0.4393,
977
+ "step": 565
978
+ },
979
+ {
980
+ "epoch": 0.3364817001180638,
981
+ "grad_norm": 13.625,
982
+ "learning_rate": 4.641602189539235e-05,
983
+ "loss": 0.5021,
984
+ "step": 570
985
+ },
986
+ {
987
+ "epoch": 0.33943329397874855,
988
+ "grad_norm": 16.5,
989
+ "learning_rate": 4.617535674162509e-05,
990
+ "loss": 0.5409,
991
+ "step": 575
992
+ },
993
+ {
994
+ "epoch": 0.33943329397874855,
995
+ "eval_loss": 0.4802681505680084,
996
+ "eval_runtime": 22.041,
997
+ "eval_samples_per_second": 90.74,
998
+ "eval_steps_per_second": 90.74,
999
+ "step": 575
1000
+ },
1001
+ {
1002
+ "epoch": 0.34238488783943327,
1003
+ "grad_norm": 18.0,
1004
+ "learning_rate": 4.59332149068431e-05,
1005
+ "loss": 0.4619,
1006
+ "step": 580
1007
+ },
1008
+ {
1009
+ "epoch": 0.34533648170011805,
1010
+ "grad_norm": 19.25,
1011
+ "learning_rate": 4.5689618496664e-05,
1012
+ "loss": 0.5019,
1013
+ "step": 585
1014
+ },
1015
+ {
1016
+ "epoch": 0.3482880755608028,
1017
+ "grad_norm": 17.25,
1018
+ "learning_rate": 4.544458974949646e-05,
1019
+ "loss": 0.3654,
1020
+ "step": 590
1021
+ },
1022
+ {
1023
+ "epoch": 0.3512396694214876,
1024
+ "grad_norm": 25.625,
1025
+ "learning_rate": 4.519815103451012e-05,
1026
+ "loss": 0.6236,
1027
+ "step": 595
1028
+ },
1029
+ {
1030
+ "epoch": 0.3541912632821724,
1031
+ "grad_norm": 30.125,
1032
+ "learning_rate": 4.4950324849593455e-05,
1033
+ "loss": 0.4846,
1034
+ "step": 600
1035
+ },
1036
+ {
1037
+ "epoch": 0.3541912632821724,
1038
+ "eval_loss": 0.47454240918159485,
1039
+ "eval_runtime": 22.0763,
1040
+ "eval_samples_per_second": 90.595,
1041
+ "eval_steps_per_second": 90.595,
1042
+ "step": 600
1043
+ },
1044
+ {
1045
+ "epoch": 0.35714285714285715,
1046
+ "grad_norm": 17.75,
1047
+ "learning_rate": 4.470113381929984e-05,
1048
+ "loss": 0.5738,
1049
+ "step": 605
1050
+ },
1051
+ {
1052
+ "epoch": 0.3600944510035419,
1053
+ "grad_norm": 16.0,
1054
+ "learning_rate": 4.445060069278218e-05,
1055
+ "loss": 0.3647,
1056
+ "step": 610
1057
+ },
1058
+ {
1059
+ "epoch": 0.3630460448642267,
1060
+ "grad_norm": 12.3125,
1061
+ "learning_rate": 4.419874834171601e-05,
1062
+ "loss": 0.4772,
1063
+ "step": 615
1064
+ },
1065
+ {
1066
+ "epoch": 0.3659976387249115,
1067
+ "grad_norm": 12.4375,
1068
+ "learning_rate": 4.3945599758211594e-05,
1069
+ "loss": 0.4309,
1070
+ "step": 620
1071
+ },
1072
+ {
1073
+ "epoch": 0.3689492325855962,
1074
+ "grad_norm": 15.4375,
1075
+ "learning_rate": 4.369117805271482e-05,
1076
+ "loss": 0.3699,
1077
+ "step": 625
1078
+ },
1079
+ {
1080
+ "epoch": 0.3689492325855962,
1081
+ "eval_loss": 0.5046902298927307,
1082
+ "eval_runtime": 22.0518,
1083
+ "eval_samples_per_second": 90.696,
1084
+ "eval_steps_per_second": 90.696,
1085
+ "step": 625
1086
+ },
1087
+ {
1088
+ "epoch": 0.371900826446281,
1089
+ "grad_norm": 26.5,
1090
+ "learning_rate": 4.343550645189751e-05,
1091
+ "loss": 0.4449,
1092
+ "step": 630
1093
+ },
1094
+ {
1095
+ "epoch": 0.37485242030696575,
1096
+ "grad_norm": 21.125,
1097
+ "learning_rate": 4.317860829653692e-05,
1098
+ "loss": 0.2902,
1099
+ "step": 635
1100
+ },
1101
+ {
1102
+ "epoch": 0.3778040141676505,
1103
+ "grad_norm": 28.875,
1104
+ "learning_rate": 4.292050703938496e-05,
1105
+ "loss": 0.5582,
1106
+ "step": 640
1107
+ },
1108
+ {
1109
+ "epoch": 0.3807556080283353,
1110
+ "grad_norm": 13.75,
1111
+ "learning_rate": 4.266122624302714e-05,
1112
+ "loss": 0.5335,
1113
+ "step": 645
1114
+ },
1115
+ {
1116
+ "epoch": 0.3837072018890201,
1117
+ "grad_norm": 13.5,
1118
+ "learning_rate": 4.2400789577731485e-05,
1119
+ "loss": 0.3741,
1120
+ "step": 650
1121
+ },
1122
+ {
1123
+ "epoch": 0.3837072018890201,
1124
+ "eval_loss": 0.4519728720188141,
1125
+ "eval_runtime": 22.0605,
1126
+ "eval_samples_per_second": 90.66,
1127
+ "eval_steps_per_second": 90.66,
1128
+ "step": 650
1129
+ },
1130
+ {
1131
+ "epoch": 0.38665879574970485,
1132
+ "grad_norm": 19.5,
1133
+ "learning_rate": 4.213922081928763e-05,
1134
+ "loss": 0.5017,
1135
+ "step": 655
1136
+ },
1137
+ {
1138
+ "epoch": 0.38961038961038963,
1139
+ "grad_norm": 13.4375,
1140
+ "learning_rate": 4.187654384683628e-05,
1141
+ "loss": 0.359,
1142
+ "step": 660
1143
+ },
1144
+ {
1145
+ "epoch": 0.3925619834710744,
1146
+ "grad_norm": 11.1875,
1147
+ "learning_rate": 4.161278264068925e-05,
1148
+ "loss": 0.2609,
1149
+ "step": 665
1150
+ },
1151
+ {
1152
+ "epoch": 0.3955135773317591,
1153
+ "grad_norm": 13.0625,
1154
+ "learning_rate": 4.134796128014022e-05,
1155
+ "loss": 0.4038,
1156
+ "step": 670
1157
+ },
1158
+ {
1159
+ "epoch": 0.3984651711924439,
1160
+ "grad_norm": 17.625,
1161
+ "learning_rate": 4.108210394126652e-05,
1162
+ "loss": 0.3166,
1163
+ "step": 675
1164
+ },
1165
+ {
1166
+ "epoch": 0.3984651711924439,
1167
+ "eval_loss": 0.44906917214393616,
1168
+ "eval_runtime": 22.0309,
1169
+ "eval_samples_per_second": 90.782,
1170
+ "eval_steps_per_second": 90.782,
1171
+ "step": 675
1172
+ },
1173
+ {
1174
+ "epoch": 0.4014167650531287,
1175
+ "grad_norm": 32.25,
1176
+ "learning_rate": 4.0815234894722035e-05,
1177
+ "loss": 0.3372,
1178
+ "step": 680
1179
+ },
1180
+ {
1181
+ "epoch": 0.40436835891381345,
1182
+ "grad_norm": 27.25,
1183
+ "learning_rate": 4.05473785035215e-05,
1184
+ "loss": 0.3346,
1185
+ "step": 685
1186
+ },
1187
+ {
1188
+ "epoch": 0.40731995277449823,
1189
+ "grad_norm": 24.75,
1190
+ "learning_rate": 4.0278559220816304e-05,
1191
+ "loss": 0.3993,
1192
+ "step": 690
1193
+ },
1194
+ {
1195
+ "epoch": 0.410271546635183,
1196
+ "grad_norm": 20.0,
1197
+ "learning_rate": 4.0008801587662194e-05,
1198
+ "loss": 0.34,
1199
+ "step": 695
1200
+ },
1201
+ {
1202
+ "epoch": 0.4132231404958678,
1203
+ "grad_norm": 27.25,
1204
+ "learning_rate": 3.9738130230778796e-05,
1205
+ "loss": 0.4442,
1206
+ "step": 700
1207
+ },
1208
+ {
1209
+ "epoch": 0.4132231404958678,
1210
+ "eval_loss": 0.4622591435909271,
1211
+ "eval_runtime": 22.0728,
1212
+ "eval_samples_per_second": 90.609,
1213
+ "eval_steps_per_second": 90.609,
1214
+ "step": 700
1215
+ },
1216
+ {
1217
+ "epoch": 0.41617473435655256,
1218
+ "grad_norm": 22.625,
1219
+ "learning_rate": 3.946656986030142e-05,
1220
+ "loss": 0.3391,
1221
+ "step": 705
1222
+ },
1223
+ {
1224
+ "epoch": 0.41912632821723733,
1225
+ "grad_norm": 26.125,
1226
+ "learning_rate": 3.919414526752524e-05,
1227
+ "loss": 0.333,
1228
+ "step": 710
1229
+ },
1230
+ {
1231
+ "epoch": 0.42207792207792205,
1232
+ "grad_norm": 7.5,
1233
+ "learning_rate": 3.8920881322642036e-05,
1234
+ "loss": 0.3791,
1235
+ "step": 715
1236
+ },
1237
+ {
1238
+ "epoch": 0.42502951593860683,
1239
+ "grad_norm": 32.0,
1240
+ "learning_rate": 3.864680297246972e-05,
1241
+ "loss": 0.5102,
1242
+ "step": 720
1243
+ },
1244
+ {
1245
+ "epoch": 0.4279811097992916,
1246
+ "grad_norm": 15.5,
1247
+ "learning_rate": 3.8371935238174924e-05,
1248
+ "loss": 0.4723,
1249
+ "step": 725
1250
+ },
1251
+ {
1252
+ "epoch": 0.4279811097992916,
1253
+ "eval_loss": 0.44943633675575256,
1254
+ "eval_runtime": 22.0672,
1255
+ "eval_samples_per_second": 90.632,
1256
+ "eval_steps_per_second": 90.632,
1257
+ "step": 725
1258
+ },
1259
+ {
1260
+ "epoch": 0.4309327036599764,
1261
+ "grad_norm": 26.875,
1262
+ "learning_rate": 3.809630321298872e-05,
1263
+ "loss": 0.4499,
1264
+ "step": 730
1265
+ },
1266
+ {
1267
+ "epoch": 0.43388429752066116,
1268
+ "grad_norm": 14.3125,
1269
+ "learning_rate": 3.78199320599159e-05,
1270
+ "loss": 0.2574,
1271
+ "step": 735
1272
+ },
1273
+ {
1274
+ "epoch": 0.43683589138134593,
1275
+ "grad_norm": 10.0625,
1276
+ "learning_rate": 3.754284700943767e-05,
1277
+ "loss": 0.3981,
1278
+ "step": 740
1279
+ },
1280
+ {
1281
+ "epoch": 0.4397874852420307,
1282
+ "grad_norm": 20.5,
1283
+ "learning_rate": 3.726507335720842e-05,
1284
+ "loss": 0.3897,
1285
+ "step": 745
1286
+ },
1287
+ {
1288
+ "epoch": 0.4427390791027155,
1289
+ "grad_norm": 12.4375,
1290
+ "learning_rate": 3.6986636461746365e-05,
1291
+ "loss": 0.3282,
1292
+ "step": 750
1293
+ },
1294
+ {
1295
+ "epoch": 0.4427390791027155,
1296
+ "eval_loss": 0.46209147572517395,
1297
+ "eval_runtime": 22.0979,
1298
+ "eval_samples_per_second": 90.506,
1299
+ "eval_steps_per_second": 90.506,
1300
+ "step": 750
1301
+ },
1302
+ {
1303
+ "epoch": 0.44569067296340026,
1304
+ "grad_norm": 17.0,
1305
+ "learning_rate": 3.6707561742118546e-05,
1306
+ "loss": 0.3905,
1307
+ "step": 755
1308
+ },
1309
+ {
1310
+ "epoch": 0.448642266824085,
1311
+ "grad_norm": 16.875,
1312
+ "learning_rate": 3.642787467562024e-05,
1313
+ "loss": 0.2688,
1314
+ "step": 760
1315
+ },
1316
+ {
1317
+ "epoch": 0.45159386068476975,
1318
+ "grad_norm": 17.625,
1319
+ "learning_rate": 3.614760079544913e-05,
1320
+ "loss": 0.369,
1321
+ "step": 765
1322
+ },
1323
+ {
1324
+ "epoch": 0.45454545454545453,
1325
+ "grad_norm": 24.0,
1326
+ "learning_rate": 3.5866765688374296e-05,
1327
+ "loss": 0.4732,
1328
+ "step": 770
1329
+ },
1330
+ {
1331
+ "epoch": 0.4574970484061393,
1332
+ "grad_norm": 16.375,
1333
+ "learning_rate": 3.558539499240037e-05,
1334
+ "loss": 0.3928,
1335
+ "step": 775
1336
+ },
1337
+ {
1338
+ "epoch": 0.4574970484061393,
1339
+ "eval_loss": 0.4500948488712311,
1340
+ "eval_runtime": 22.071,
1341
+ "eval_samples_per_second": 90.617,
1342
+ "eval_steps_per_second": 90.617,
1343
+ "step": 775
1344
+ },
1345
+ {
1346
+ "epoch": 0.4604486422668241,
1347
+ "grad_norm": 11.375,
1348
+ "learning_rate": 3.530351439442696e-05,
1349
+ "loss": 0.498,
1350
+ "step": 780
1351
+ },
1352
+ {
1353
+ "epoch": 0.46340023612750886,
1354
+ "grad_norm": 9.125,
1355
+ "learning_rate": 3.502114962790366e-05,
1356
+ "loss": 0.3545,
1357
+ "step": 785
1358
+ },
1359
+ {
1360
+ "epoch": 0.46635182998819363,
1361
+ "grad_norm": 12.375,
1362
+ "learning_rate": 3.473832647048079e-05,
1363
+ "loss": 0.5654,
1364
+ "step": 790
1365
+ },
1366
+ {
1367
+ "epoch": 0.4693034238488784,
1368
+ "grad_norm": 10.9375,
1369
+ "learning_rate": 3.445507074165612e-05,
1370
+ "loss": 0.3995,
1371
+ "step": 795
1372
+ },
1373
+ {
1374
+ "epoch": 0.4722550177095632,
1375
+ "grad_norm": 8.375,
1376
+ "learning_rate": 3.41714083004177e-05,
1377
+ "loss": 0.4174,
1378
+ "step": 800
1379
+ },
1380
+ {
1381
+ "epoch": 0.4722550177095632,
1382
+ "eval_loss": 0.4428013265132904,
1383
+ "eval_runtime": 22.0412,
1384
+ "eval_samples_per_second": 90.739,
1385
+ "eval_steps_per_second": 90.739,
1386
+ "step": 800
1387
+ },
1388
+ {
1389
+ "epoch": 0.47520661157024796,
1390
+ "grad_norm": 14.3125,
1391
+ "learning_rate": 3.3887365042883226e-05,
1392
+ "loss": 0.487,
1393
+ "step": 805
1394
+ },
1395
+ {
1396
+ "epoch": 0.4781582054309327,
1397
+ "grad_norm": 12.4375,
1398
+ "learning_rate": 3.360296689993586e-05,
1399
+ "loss": 0.367,
1400
+ "step": 810
1401
+ },
1402
+ {
1403
+ "epoch": 0.48110979929161746,
1404
+ "grad_norm": 10.375,
1405
+ "learning_rate": 3.331823983485695e-05,
1406
+ "loss": 0.2972,
1407
+ "step": 815
1408
+ },
1409
+ {
1410
+ "epoch": 0.48406139315230223,
1411
+ "grad_norm": 14.0,
1412
+ "learning_rate": 3.303320984095584e-05,
1413
+ "loss": 0.417,
1414
+ "step": 820
1415
+ },
1416
+ {
1417
+ "epoch": 0.487012987012987,
1418
+ "grad_norm": 25.0,
1419
+ "learning_rate": 3.274790293919685e-05,
1420
+ "loss": 0.3814,
1421
+ "step": 825
1422
+ },
1423
+ {
1424
+ "epoch": 0.487012987012987,
1425
+ "eval_loss": 0.45124053955078125,
1426
+ "eval_runtime": 22.0625,
1427
+ "eval_samples_per_second": 90.652,
1428
+ "eval_steps_per_second": 90.652,
1429
+ "step": 825
1430
+ },
1431
+ {
1432
+ "epoch": 0.4899645808736718,
1433
+ "grad_norm": 19.125,
1434
+ "learning_rate": 3.246234517582378e-05,
1435
+ "loss": 0.4698,
1436
+ "step": 830
1437
+ },
1438
+ {
1439
+ "epoch": 0.49291617473435656,
1440
+ "grad_norm": 19.625,
1441
+ "learning_rate": 3.217656261998208e-05,
1442
+ "loss": 0.4826,
1443
+ "step": 835
1444
+ },
1445
+ {
1446
+ "epoch": 0.49586776859504134,
1447
+ "grad_norm": 9.25,
1448
+ "learning_rate": 3.189058136133898e-05,
1449
+ "loss": 0.4592,
1450
+ "step": 840
1451
+ },
1452
+ {
1453
+ "epoch": 0.4988193624557261,
1454
+ "grad_norm": 7.40625,
1455
+ "learning_rate": 3.1604427507701675e-05,
1456
+ "loss": 0.551,
1457
+ "step": 845
1458
+ },
1459
+ {
1460
+ "epoch": 0.5017709563164109,
1461
+ "grad_norm": 22.25,
1462
+ "learning_rate": 3.131812718263392e-05,
1463
+ "loss": 0.3845,
1464
+ "step": 850
1465
+ },
1466
+ {
1467
+ "epoch": 0.5017709563164109,
1468
+ "eval_loss": 0.42552411556243896,
1469
+ "eval_runtime": 22.0632,
1470
+ "eval_samples_per_second": 90.649,
1471
+ "eval_steps_per_second": 90.649,
1472
+ "step": 850
1473
+ },
1474
+ {
1475
+ "epoch": 0.5047225501770957,
1476
+ "grad_norm": 11.875,
1477
+ "learning_rate": 3.1031706523071115e-05,
1478
+ "loss": 0.5727,
1479
+ "step": 855
1480
+ },
1481
+ {
1482
+ "epoch": 0.5076741440377804,
1483
+ "grad_norm": 5.21875,
1484
+ "learning_rate": 3.0745191676934285e-05,
1485
+ "loss": 0.4759,
1486
+ "step": 860
1487
+ },
1488
+ {
1489
+ "epoch": 0.5106257378984652,
1490
+ "grad_norm": 20.75,
1491
+ "learning_rate": 3.0458608800742883e-05,
1492
+ "loss": 0.4727,
1493
+ "step": 865
1494
+ },
1495
+ {
1496
+ "epoch": 0.51357733175915,
1497
+ "grad_norm": 19.125,
1498
+ "learning_rate": 3.0171984057227008e-05,
1499
+ "loss": 0.4733,
1500
+ "step": 870
1501
+ },
1502
+ {
1503
+ "epoch": 0.5165289256198347,
1504
+ "grad_norm": 21.125,
1505
+ "learning_rate": 2.988534361293888e-05,
1506
+ "loss": 0.3789,
1507
+ "step": 875
1508
+ },
1509
+ {
1510
+ "epoch": 0.5165289256198347,
1511
+ "eval_loss": 0.41610679030418396,
1512
+ "eval_runtime": 22.0756,
1513
+ "eval_samples_per_second": 90.598,
1514
+ "eval_steps_per_second": 90.598,
1515
+ "step": 875
1516
+ },
1517
+ {
1518
+ "epoch": 0.5194805194805194,
1519
+ "grad_norm": 10.8125,
1520
+ "learning_rate": 2.959871363586411e-05,
1521
+ "loss": 0.4258,
1522
+ "step": 880
1523
+ },
1524
+ {
1525
+ "epoch": 0.5224321133412042,
1526
+ "grad_norm": 9.0625,
1527
+ "learning_rate": 2.9312120293032703e-05,
1528
+ "loss": 0.408,
1529
+ "step": 885
1530
+ },
1531
+ {
1532
+ "epoch": 0.525383707201889,
1533
+ "grad_norm": 29.375,
1534
+ "learning_rate": 2.902558974813026e-05,
1535
+ "loss": 0.5145,
1536
+ "step": 890
1537
+ },
1538
+ {
1539
+ "epoch": 0.5283353010625738,
1540
+ "grad_norm": 14.9375,
1541
+ "learning_rate": 2.873914815910944e-05,
1542
+ "loss": 0.4281,
1543
+ "step": 895
1544
+ },
1545
+ {
1546
+ "epoch": 0.5312868949232585,
1547
+ "grad_norm": 19.125,
1548
+ "learning_rate": 2.8452821675801944e-05,
1549
+ "loss": 0.3189,
1550
+ "step": 900
1551
+ },
1552
+ {
1553
+ "epoch": 0.5312868949232585,
1554
+ "eval_loss": 0.41652774810791016,
1555
+ "eval_runtime": 22.0235,
1556
+ "eval_samples_per_second": 90.812,
1557
+ "eval_steps_per_second": 90.812,
1558
+ "step": 900
1559
+ },
1560
+ {
1561
+ "epoch": 0.5342384887839433,
1562
+ "grad_norm": 19.375,
1563
+ "learning_rate": 2.81666364375312e-05,
1564
+ "loss": 0.4598,
1565
+ "step": 905
1566
+ },
1567
+ {
1568
+ "epoch": 0.5371900826446281,
1569
+ "grad_norm": 13.4375,
1570
+ "learning_rate": 2.7880618570726142e-05,
1571
+ "loss": 0.4811,
1572
+ "step": 910
1573
+ },
1574
+ {
1575
+ "epoch": 0.5401416765053129,
1576
+ "grad_norm": 23.0,
1577
+ "learning_rate": 2.7594794186535993e-05,
1578
+ "loss": 0.4931,
1579
+ "step": 915
1580
+ },
1581
+ {
1582
+ "epoch": 0.5430932703659976,
1583
+ "grad_norm": 24.125,
1584
+ "learning_rate": 2.7309189378446578e-05,
1585
+ "loss": 0.5955,
1586
+ "step": 920
1587
+ },
1588
+ {
1589
+ "epoch": 0.5460448642266824,
1590
+ "grad_norm": 19.875,
1591
+ "learning_rate": 2.702383021989817e-05,
1592
+ "loss": 0.3743,
1593
+ "step": 925
1594
+ },
1595
+ {
1596
+ "epoch": 0.5460448642266824,
1597
+ "eval_loss": 0.41767558455467224,
1598
+ "eval_runtime": 22.0719,
1599
+ "eval_samples_per_second": 90.613,
1600
+ "eval_steps_per_second": 90.613,
1601
+ "step": 925
1602
+ }
1603
+ ],
1604
+ "logging_steps": 5,
1605
+ "max_steps": 1694,
1606
+ "num_input_tokens_seen": 0,
1607
+ "num_train_epochs": 1,
1608
+ "save_steps": 25,
1609
+ "stateful_callbacks": {
1610
+ "TrainerControl": {
1611
+ "args": {
1612
+ "should_epoch_stop": false,
1613
+ "should_evaluate": false,
1614
+ "should_log": false,
1615
+ "should_save": true,
1616
+ "should_training_stop": false
1617
+ },
1618
+ "attributes": {}
1619
+ }
1620
+ },
1621
+ "total_flos": 1.3649613399392256e+16,
1622
+ "train_batch_size": 1,
1623
+ "trial_name": null,
1624
+ "trial_params": null
1625
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b9d3fb6a396b6f7fbd066e1f1e57ac46cbb2892debf89344c191c26dfa7a815
3
+ size 5777
vocab.json ADDED
The diff for this file is too large to render. See raw diff