Text Generation
PEFT
Safetensors
English
qlora
lora
structured-output
ShogoMu commited on
Commit
fca50ad
·
verified ·
1 Parent(s): e94f2cf

Upload LoRA adapter (README written by author)

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Instruct-2507
3
+ datasets:
4
+ - u-10bei/structured_data_with_cot_dataset_512_v2
5
+ - daichira/structured-hard-sft-4k
6
+ language:
7
+ - en
8
+ license: apache-2.0
9
+ library_name: peft
10
+ pipeline_tag: text-generation
11
+ tags:
12
+ - qlora
13
+ - lora
14
+ - structured-output
15
+ ---
16
+
17
+ # qwen3-4b-structured-output-lora-v5
18
+
19
+ This repository provides a **LoRA adapter** fine-tuned from
20
+ **Qwen/Qwen3-4B-Instruct-2507** using **QLoRA (4-bit, Unsloth)**.
21
+
22
+ This repository contains **LoRA adapter weights only**.
23
+ The base model must be loaded separately.
24
+
25
+ ## Training Objective
26
+
27
+ This adapter is trained to improve **structured output accuracy**
28
+ (JSON / YAML / XML / TOML / CSV).
29
+
30
+ Loss is applied only to the final assistant output,
31
+ while intermediate reasoning (Chain-of-Thought) is masked.
32
+
33
+ ## Training Configuration
34
+
35
+ - Base model: Qwen/Qwen3-4B-Instruct-2507
36
+ - Method: QLoRA (4-bit)
37
+ - Max sequence length: 1024
38
+ - Epochs: 4
39
+ - Learning rate: 1e-06
40
+ - LoRA: r=64, alpha=128
41
+
42
+ ## Usage
43
+
44
+ ```python
45
+ from transformers import AutoModelForCausalLM, AutoTokenizer
46
+ from peft import PeftModel
47
+ import torch
48
+
49
+ base = "Qwen/Qwen3-4B-Instruct-2507"
50
+ adapter = "your_id/your-repo"
51
+
52
+ tokenizer = AutoTokenizer.from_pretrained(base)
53
+ model = AutoModelForCausalLM.from_pretrained(
54
+ base,
55
+ torch_dtype=torch.float16,
56
+ device_map="auto",
57
+ )
58
+ model = PeftModel.from_pretrained(model, adapter)
59
+ ```
60
+
61
+ ## Sources & Terms (IMPORTANT)
62
+
63
+ Training data:
64
+ - u-10bei/structured_data_with_cot_dataset_512_v2
65
+ - daichira/structured-hard-sft-4k
66
+
67
+ Dataset License: MIT. Both datasets are used and distributed under the terms of the MIT License.
68
+ Compliance: Users must comply with the MIT License (including preservation of the copyright notice) and with the base model's original terms of use.
adapter_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Qwen3ForCausalLM",
7
+ "parent_library": "transformers.models.qwen3.modeling_qwen3",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/qwen3-4b-instruct-2507-unsloth-bnb-4bit",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 128,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0.0,
26
+ "megatron_config": null,
27
+ "megatron_core": "megatron.core",
28
+ "modules_to_save": null,
29
+ "peft_type": "LORA",
30
+ "peft_version": "0.18.1",
31
+ "qalora_group_size": 16,
32
+ "r": 64,
33
+ "rank_pattern": {},
34
+ "revision": null,
35
+ "target_modules": [
36
+ "down_proj",
37
+ "up_proj",
38
+ "o_proj",
39
+ "q_proj",
40
+ "gate_proj",
41
+ "v_proj",
42
+ "k_proj"
43
+ ],
44
+ "target_parameters": null,
45
+ "task_type": "CAUSAL_LM",
46
+ "trainable_token_indices": null,
47
+ "use_dora": false,
48
+ "use_qalora": false,
49
+ "use_rslora": false
50
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82760ce0d2511d9cc5cd7fe7e6c1b571bf1c8f3b3dfae80791046c9ec0fafa50
3
+ size 528550256
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|vision_pad|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 262144,
235
+ "pad_token": "<|vision_pad|>",
236
+ "padding_side": "right",
237
+ "split_special_tokens": false,
238
+ "tokenizer_class": "Qwen2Tokenizer",
239
+ "unk_token": null
240
+ }
trainer_state.json ADDED
@@ -0,0 +1,1410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.5248070562293274,
6
+ "eval_steps": 50,
7
+ "global_step": 1600,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.022050716648291068,
14
+ "grad_norm": 2.091796398162842,
15
+ "learning_rate": 4.9450549450549446e-08,
16
+ "loss": 1.3103,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.044101433296582136,
21
+ "grad_norm": 2.3156776428222656,
22
+ "learning_rate": 1.0439560439560439e-07,
23
+ "loss": 1.4355,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.06615214994487321,
28
+ "grad_norm": 1.8849185705184937,
29
+ "learning_rate": 1.5934065934065932e-07,
30
+ "loss": 1.3735,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.08820286659316427,
35
+ "grad_norm": 2.9574432373046875,
36
+ "learning_rate": 2.1428571428571426e-07,
37
+ "loss": 1.4364,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.11025358324145534,
42
+ "grad_norm": 2.344658136367798,
43
+ "learning_rate": 2.692307692307692e-07,
44
+ "loss": 1.516,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.11025358324145534,
49
+ "eval_loss": 1.4678384065628052,
50
+ "eval_runtime": 30.9931,
51
+ "eval_samples_per_second": 12.487,
52
+ "eval_steps_per_second": 6.259,
53
+ "step": 50
54
+ },
55
+ {
56
+ "epoch": 0.13230429988974643,
57
+ "grad_norm": 1.638563632965088,
58
+ "learning_rate": 3.2417582417582416e-07,
59
+ "loss": 1.3533,
60
+ "step": 60
61
+ },
62
+ {
63
+ "epoch": 0.1543550165380375,
64
+ "grad_norm": 2.478177547454834,
65
+ "learning_rate": 3.791208791208791e-07,
66
+ "loss": 1.3557,
67
+ "step": 70
68
+ },
69
+ {
70
+ "epoch": 0.17640573318632854,
71
+ "grad_norm": 1.6685155630111694,
72
+ "learning_rate": 4.3406593406593404e-07,
73
+ "loss": 1.2135,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 0.19845644983461963,
78
+ "grad_norm": 1.918034553527832,
79
+ "learning_rate": 4.890109890109889e-07,
80
+ "loss": 1.4055,
81
+ "step": 90
82
+ },
83
+ {
84
+ "epoch": 0.2205071664829107,
85
+ "grad_norm": 2.07231068611145,
86
+ "learning_rate": 5.439560439560439e-07,
87
+ "loss": 1.2006,
88
+ "step": 100
89
+ },
90
+ {
91
+ "epoch": 0.2205071664829107,
92
+ "eval_loss": 1.4129432439804077,
93
+ "eval_runtime": 31.1241,
94
+ "eval_samples_per_second": 12.434,
95
+ "eval_steps_per_second": 6.233,
96
+ "step": 100
97
+ },
98
+ {
99
+ "epoch": 0.24255788313120177,
100
+ "grad_norm": 1.890555739402771,
101
+ "learning_rate": 5.989010989010988e-07,
102
+ "loss": 1.288,
103
+ "step": 110
104
+ },
105
+ {
106
+ "epoch": 0.26460859977949286,
107
+ "grad_norm": 1.5374747514724731,
108
+ "learning_rate": 6.538461538461538e-07,
109
+ "loss": 1.3875,
110
+ "step": 120
111
+ },
112
+ {
113
+ "epoch": 0.2866593164277839,
114
+ "grad_norm": 1.4329373836517334,
115
+ "learning_rate": 7.087912087912088e-07,
116
+ "loss": 1.1613,
117
+ "step": 130
118
+ },
119
+ {
120
+ "epoch": 0.308710033076075,
121
+ "grad_norm": 2.4291820526123047,
122
+ "learning_rate": 7.637362637362636e-07,
123
+ "loss": 1.2108,
124
+ "step": 140
125
+ },
126
+ {
127
+ "epoch": 0.33076074972436603,
128
+ "grad_norm": 2.4429850578308105,
129
+ "learning_rate": 8.186813186813187e-07,
130
+ "loss": 1.2541,
131
+ "step": 150
132
+ },
133
+ {
134
+ "epoch": 0.33076074972436603,
135
+ "eval_loss": 1.2611055374145508,
136
+ "eval_runtime": 30.9842,
137
+ "eval_samples_per_second": 12.49,
138
+ "eval_steps_per_second": 6.261,
139
+ "step": 150
140
+ },
141
+ {
142
+ "epoch": 0.3528114663726571,
143
+ "grad_norm": 1.2373392581939697,
144
+ "learning_rate": 8.736263736263736e-07,
145
+ "loss": 1.0923,
146
+ "step": 160
147
+ },
148
+ {
149
+ "epoch": 0.3748621830209482,
150
+ "grad_norm": 1.0209314823150635,
151
+ "learning_rate": 9.285714285714285e-07,
152
+ "loss": 1.214,
153
+ "step": 170
154
+ },
155
+ {
156
+ "epoch": 0.39691289966923926,
157
+ "grad_norm": 0.8216220140457153,
158
+ "learning_rate": 9.835164835164834e-07,
159
+ "loss": 0.9525,
160
+ "step": 180
161
+ },
162
+ {
163
+ "epoch": 0.4189636163175303,
164
+ "grad_norm": 1.5757627487182617,
165
+ "learning_rate": 9.999547180444195e-07,
166
+ "loss": 0.9399,
167
+ "step": 190
168
+ },
169
+ {
170
+ "epoch": 0.4410143329658214,
171
+ "grad_norm": 1.2908568382263184,
172
+ "learning_rate": 9.997329486184973e-07,
173
+ "loss": 1.0129,
174
+ "step": 200
175
+ },
176
+ {
177
+ "epoch": 0.4410143329658214,
178
+ "eval_loss": 1.1076371669769287,
179
+ "eval_runtime": 31.0875,
180
+ "eval_samples_per_second": 12.449,
181
+ "eval_steps_per_second": 6.24,
182
+ "step": 200
183
+ },
184
+ {
185
+ "epoch": 0.46306504961411243,
186
+ "grad_norm": 0.6605131030082703,
187
+ "learning_rate": 9.993264565013427e-07,
188
+ "loss": 0.8898,
189
+ "step": 210
190
+ },
191
+ {
192
+ "epoch": 0.48511576626240355,
193
+ "grad_norm": 0.5790239572525024,
194
+ "learning_rate": 9.98735391949844e-07,
195
+ "loss": 1.0172,
196
+ "step": 220
197
+ },
198
+ {
199
+ "epoch": 0.5071664829106945,
200
+ "grad_norm": 0.7017053961753845,
201
+ "learning_rate": 9.979599734467628e-07,
202
+ "loss": 0.9997,
203
+ "step": 230
204
+ },
205
+ {
206
+ "epoch": 0.5292171995589857,
207
+ "grad_norm": 0.5328222513198853,
208
+ "learning_rate": 9.97000487619973e-07,
209
+ "loss": 0.9148,
210
+ "step": 240
211
+ },
212
+ {
213
+ "epoch": 0.5512679162072768,
214
+ "grad_norm": 0.4753032326698303,
215
+ "learning_rate": 9.958572891365115e-07,
216
+ "loss": 0.9804,
217
+ "step": 250
218
+ },
219
+ {
220
+ "epoch": 0.5512679162072768,
221
+ "eval_loss": 1.0351777076721191,
222
+ "eval_runtime": 30.7039,
223
+ "eval_samples_per_second": 12.604,
224
+ "eval_steps_per_second": 6.318,
225
+ "step": 250
226
+ },
227
+ {
228
+ "epoch": 0.5733186328555678,
229
+ "grad_norm": 0.4930404722690582,
230
+ "learning_rate": 9.945308005714784e-07,
231
+ "loss": 0.9634,
232
+ "step": 260
233
+ },
234
+ {
235
+ "epoch": 0.5953693495038589,
236
+ "grad_norm": 0.48022523522377014,
237
+ "learning_rate": 9.93021512251833e-07,
238
+ "loss": 0.9131,
239
+ "step": 270
240
+ },
241
+ {
242
+ "epoch": 0.61742006615215,
243
+ "grad_norm": 1.1264078617095947,
244
+ "learning_rate": 9.913299820751512e-07,
245
+ "loss": 0.9378,
246
+ "step": 280
247
+ },
248
+ {
249
+ "epoch": 0.639470782800441,
250
+ "grad_norm": 0.5375543236732483,
251
+ "learning_rate": 9.894568353033998e-07,
252
+ "loss": 0.9795,
253
+ "step": 290
254
+ },
255
+ {
256
+ "epoch": 0.6615214994487321,
257
+ "grad_norm": 0.5027572512626648,
258
+ "learning_rate": 9.874027643318147e-07,
259
+ "loss": 1.0432,
260
+ "step": 300
261
+ },
262
+ {
263
+ "epoch": 0.6615214994487321,
264
+ "eval_loss": 0.9879136085510254,
265
+ "eval_runtime": 31.0824,
266
+ "eval_samples_per_second": 12.451,
267
+ "eval_steps_per_second": 6.241,
268
+ "step": 300
269
+ },
270
+ {
271
+ "epoch": 0.6835722160970231,
272
+ "grad_norm": 0.6522424221038818,
273
+ "learning_rate": 9.851685284329612e-07,
274
+ "loss": 0.9544,
275
+ "step": 310
276
+ },
277
+ {
278
+ "epoch": 0.7056229327453142,
279
+ "grad_norm": 0.4375791549682617,
280
+ "learning_rate": 9.827549534760743e-07,
281
+ "loss": 0.9337,
282
+ "step": 320
283
+ },
284
+ {
285
+ "epoch": 0.7276736493936052,
286
+ "grad_norm": 0.5359438061714172,
287
+ "learning_rate": 9.801629316217826e-07,
288
+ "loss": 0.9501,
289
+ "step": 330
290
+ },
291
+ {
292
+ "epoch": 0.7497243660418964,
293
+ "grad_norm": 0.5156540870666504,
294
+ "learning_rate": 9.773934209923273e-07,
295
+ "loss": 0.9362,
296
+ "step": 340
297
+ },
298
+ {
299
+ "epoch": 0.7717750826901875,
300
+ "grad_norm": 0.3920913636684418,
301
+ "learning_rate": 9.74447445317399e-07,
302
+ "loss": 0.9031,
303
+ "step": 350
304
+ },
305
+ {
306
+ "epoch": 0.7717750826901875,
307
+ "eval_loss": 0.9537778496742249,
308
+ "eval_runtime": 30.8673,
309
+ "eval_samples_per_second": 12.538,
310
+ "eval_steps_per_second": 6.285,
311
+ "step": 350
312
+ },
313
+ {
314
+ "epoch": 0.7938257993384785,
315
+ "grad_norm": 0.5934253334999084,
316
+ "learning_rate": 9.713260935557233e-07,
317
+ "loss": 0.8994,
318
+ "step": 360
319
+ },
320
+ {
321
+ "epoch": 0.8158765159867696,
322
+ "grad_norm": 0.5070703625679016,
323
+ "learning_rate": 9.680305194925356e-07,
324
+ "loss": 0.9918,
325
+ "step": 370
326
+ },
327
+ {
328
+ "epoch": 0.8379272326350606,
329
+ "grad_norm": 0.4203622043132782,
330
+ "learning_rate": 9.645619413130921e-07,
331
+ "loss": 0.9669,
332
+ "step": 380
333
+ },
334
+ {
335
+ "epoch": 0.8599779492833517,
336
+ "grad_norm": 0.4589724838733673,
337
+ "learning_rate": 9.60921641152377e-07,
338
+ "loss": 0.8177,
339
+ "step": 390
340
+ },
341
+ {
342
+ "epoch": 0.8820286659316428,
343
+ "grad_norm": 0.4475689232349396,
344
+ "learning_rate": 9.571109646211698e-07,
345
+ "loss": 0.9922,
346
+ "step": 400
347
+ },
348
+ {
349
+ "epoch": 0.8820286659316428,
350
+ "eval_loss": 0.9277470707893372,
351
+ "eval_runtime": 30.9395,
352
+ "eval_samples_per_second": 12.508,
353
+ "eval_steps_per_second": 6.27,
354
+ "step": 400
355
+ },
356
+ {
357
+ "epoch": 0.9040793825799338,
358
+ "grad_norm": 0.4121669828891754,
359
+ "learning_rate": 9.531313203086502e-07,
360
+ "loss": 0.9197,
361
+ "step": 410
362
+ },
363
+ {
364
+ "epoch": 0.9261300992282249,
365
+ "grad_norm": 0.3806382417678833,
366
+ "learning_rate": 9.489841792617239e-07,
367
+ "loss": 0.9206,
368
+ "step": 420
369
+ },
370
+ {
371
+ "epoch": 0.948180815876516,
372
+ "grad_norm": 0.4550821781158447,
373
+ "learning_rate": 9.446710744412594e-07,
374
+ "loss": 0.9232,
375
+ "step": 430
376
+ },
377
+ {
378
+ "epoch": 0.9702315325248071,
379
+ "grad_norm": 0.49195268750190735,
380
+ "learning_rate": 9.401936001554413e-07,
381
+ "loss": 0.8856,
382
+ "step": 440
383
+ },
384
+ {
385
+ "epoch": 0.9922822491730982,
386
+ "grad_norm": 0.47510039806365967,
387
+ "learning_rate": 9.355534114704451e-07,
388
+ "loss": 0.7298,
389
+ "step": 450
390
+ },
391
+ {
392
+ "epoch": 0.9922822491730982,
393
+ "eval_loss": 0.9063670039176941,
394
+ "eval_runtime": 30.9858,
395
+ "eval_samples_per_second": 12.49,
396
+ "eval_steps_per_second": 6.261,
397
+ "step": 450
398
+ },
399
+ {
400
+ "epoch": 1.0132304299889747,
401
+ "grad_norm": 0.45156633853912354,
402
+ "learning_rate": 9.307522235986536e-07,
403
+ "loss": 0.9249,
404
+ "step": 460
405
+ },
406
+ {
407
+ "epoch": 1.0352811466372658,
408
+ "grad_norm": 0.41880857944488525,
409
+ "learning_rate": 9.25791811264642e-07,
410
+ "loss": 0.8693,
411
+ "step": 470
412
+ },
413
+ {
414
+ "epoch": 1.0573318632855568,
415
+ "grad_norm": 0.418094664812088,
416
+ "learning_rate": 9.206740080491626e-07,
417
+ "loss": 0.8577,
418
+ "step": 480
419
+ },
420
+ {
421
+ "epoch": 1.079382579933848,
422
+ "grad_norm": 0.5093225836753845,
423
+ "learning_rate": 9.154007057113755e-07,
424
+ "loss": 0.7593,
425
+ "step": 490
426
+ },
427
+ {
428
+ "epoch": 1.101433296582139,
429
+ "grad_norm": 0.33123084902763367,
430
+ "learning_rate": 9.099738534895736e-07,
431
+ "loss": 0.8276,
432
+ "step": 500
433
+ },
434
+ {
435
+ "epoch": 1.101433296582139,
436
+ "eval_loss": 0.8895114064216614,
437
+ "eval_runtime": 30.7938,
438
+ "eval_samples_per_second": 12.567,
439
+ "eval_steps_per_second": 6.3,
440
+ "step": 500
441
+ },
442
+ {
443
+ "epoch": 1.12348401323043,
444
+ "grad_norm": 0.4701198637485504,
445
+ "learning_rate": 9.043954573806596e-07,
446
+ "loss": 0.8623,
447
+ "step": 510
448
+ },
449
+ {
450
+ "epoch": 1.145534729878721,
451
+ "grad_norm": 0.4642221927642822,
452
+ "learning_rate": 8.986675793986451e-07,
453
+ "loss": 0.8981,
454
+ "step": 520
455
+ },
456
+ {
457
+ "epoch": 1.1675854465270121,
458
+ "grad_norm": 0.4403532147407532,
459
+ "learning_rate": 8.927923368124411e-07,
460
+ "loss": 0.8752,
461
+ "step": 530
462
+ },
463
+ {
464
+ "epoch": 1.1896361631753032,
465
+ "grad_norm": 0.542199969291687,
466
+ "learning_rate": 8.867719013632246e-07,
467
+ "loss": 0.831,
468
+ "step": 540
469
+ },
470
+ {
471
+ "epoch": 1.2116868798235942,
472
+ "grad_norm": 0.382845014333725,
473
+ "learning_rate": 8.806084984616712e-07,
474
+ "loss": 0.887,
475
+ "step": 550
476
+ },
477
+ {
478
+ "epoch": 1.2116868798235942,
479
+ "eval_loss": 0.8739157915115356,
480
+ "eval_runtime": 30.8799,
481
+ "eval_samples_per_second": 12.532,
482
+ "eval_steps_per_second": 6.282,
483
+ "step": 550
484
+ },
485
+ {
486
+ "epoch": 1.2337375964718853,
487
+ "grad_norm": 0.4585653245449066,
488
+ "learning_rate": 8.743044063653465e-07,
489
+ "loss": 0.8997,
490
+ "step": 560
491
+ },
492
+ {
493
+ "epoch": 1.2557883131201764,
494
+ "grad_norm": 0.35734766721725464,
495
+ "learning_rate": 8.678619553365658e-07,
496
+ "loss": 0.7794,
497
+ "step": 570
498
+ },
499
+ {
500
+ "epoch": 1.2778390297684674,
501
+ "grad_norm": 0.38038501143455505,
502
+ "learning_rate": 8.612835267810286e-07,
503
+ "loss": 0.7101,
504
+ "step": 580
505
+ },
506
+ {
507
+ "epoch": 1.2998897464167585,
508
+ "grad_norm": 0.5496143698692322,
509
+ "learning_rate": 8.5457155236755e-07,
510
+ "loss": 0.901,
511
+ "step": 590
512
+ },
513
+ {
514
+ "epoch": 1.3219404630650495,
515
+ "grad_norm": 0.36970055103302,
516
+ "learning_rate": 8.477285131292107e-07,
517
+ "loss": 0.8555,
518
+ "step": 600
519
+ },
520
+ {
521
+ "epoch": 1.3219404630650495,
522
+ "eval_loss": 0.8605629205703735,
523
+ "eval_runtime": 31.8221,
524
+ "eval_samples_per_second": 12.161,
525
+ "eval_steps_per_second": 6.096,
526
+ "step": 600
527
+ },
528
+ {
529
+ "epoch": 1.3439911797133406,
530
+ "grad_norm": 0.3344724476337433,
531
+ "learning_rate": 8.407569385462614e-07,
532
+ "loss": 0.899,
533
+ "step": 610
534
+ },
535
+ {
536
+ "epoch": 1.3660418963616316,
537
+ "grad_norm": 0.359423965215683,
538
+ "learning_rate": 8.336594056111197e-07,
539
+ "loss": 0.8091,
540
+ "step": 620
541
+ },
542
+ {
543
+ "epoch": 1.3880926130099227,
544
+ "grad_norm": 0.33909741044044495,
545
+ "learning_rate": 8.264385378758013e-07,
546
+ "loss": 0.7466,
547
+ "step": 630
548
+ },
549
+ {
550
+ "epoch": 1.4101433296582138,
551
+ "grad_norm": 0.3617461025714874,
552
+ "learning_rate": 8.190970044821446e-07,
553
+ "loss": 0.7555,
554
+ "step": 640
555
+ },
556
+ {
557
+ "epoch": 1.432194046306505,
558
+ "grad_norm": 0.5685471296310425,
559
+ "learning_rate": 8.116375191751807e-07,
560
+ "loss": 0.8141,
561
+ "step": 650
562
+ },
563
+ {
564
+ "epoch": 1.432194046306505,
565
+ "eval_loss": 0.8483839631080627,
566
+ "eval_runtime": 31.1689,
567
+ "eval_samples_per_second": 12.416,
568
+ "eval_steps_per_second": 6.224,
569
+ "step": 650
570
+ },
571
+ {
572
+ "epoch": 1.454244762954796,
573
+ "grad_norm": 0.6284070611000061,
574
+ "learning_rate": 8.040628393000177e-07,
575
+ "loss": 0.8376,
576
+ "step": 660
577
+ },
578
+ {
579
+ "epoch": 1.4762954796030872,
580
+ "grad_norm": 0.4221034049987793,
581
+ "learning_rate": 7.963757647826071e-07,
582
+ "loss": 0.9069,
583
+ "step": 670
584
+ },
585
+ {
586
+ "epoch": 1.4983461962513782,
587
+ "grad_norm": 0.4318729341030121,
588
+ "learning_rate": 7.88579137094772e-07,
589
+ "loss": 0.8511,
590
+ "step": 680
591
+ },
592
+ {
593
+ "epoch": 1.5203969128996693,
594
+ "grad_norm": 0.4035641551017761,
595
+ "learning_rate": 7.806758382038772e-07,
596
+ "loss": 0.7949,
597
+ "step": 690
598
+ },
599
+ {
600
+ "epoch": 1.5424476295479603,
601
+ "grad_norm": 0.37978196144104004,
602
+ "learning_rate": 7.726687895075308e-07,
603
+ "loss": 0.8521,
604
+ "step": 700
605
+ },
606
+ {
607
+ "epoch": 1.5424476295479603,
608
+ "eval_loss": 0.8365304470062256,
609
+ "eval_runtime": 30.9818,
610
+ "eval_samples_per_second": 12.491,
611
+ "eval_steps_per_second": 6.262,
612
+ "step": 700
613
+ },
614
+ {
615
+ "epoch": 1.5644983461962514,
616
+ "grad_norm": 0.405729204416275,
617
+ "learning_rate": 7.645609507537105e-07,
618
+ "loss": 0.8085,
619
+ "step": 710
620
+ },
621
+ {
622
+ "epoch": 1.5865490628445424,
623
+ "grad_norm": 1.3865073919296265,
624
+ "learning_rate": 7.563553189467136e-07,
625
+ "loss": 0.9756,
626
+ "step": 720
627
+ },
628
+ {
629
+ "epoch": 1.6085997794928335,
630
+ "grad_norm": 0.4169498682022095,
631
+ "learning_rate": 7.480549272393371e-07,
632
+ "loss": 0.8028,
633
+ "step": 730
634
+ },
635
+ {
636
+ "epoch": 1.6306504961411246,
637
+ "grad_norm": 0.7300071716308594,
638
+ "learning_rate": 7.39662843811693e-07,
639
+ "loss": 0.8578,
640
+ "step": 740
641
+ },
642
+ {
643
+ "epoch": 1.6527012127894156,
644
+ "grad_norm": 0.4267706573009491,
645
+ "learning_rate": 7.311821707370792e-07,
646
+ "loss": 0.7597,
647
+ "step": 750
648
+ },
649
+ {
650
+ "epoch": 1.6527012127894156,
651
+ "eval_loss": 0.8256433010101318,
652
+ "eval_runtime": 30.9848,
653
+ "eval_samples_per_second": 12.49,
654
+ "eval_steps_per_second": 6.261,
655
+ "step": 750
656
+ },
657
+ {
658
+ "epoch": 1.6747519294377067,
659
+ "grad_norm": 0.4179936945438385,
660
+ "learning_rate": 7.226160428353189e-07,
661
+ "loss": 0.8828,
662
+ "step": 760
663
+ },
664
+ {
665
+ "epoch": 1.696802646085998,
666
+ "grad_norm": 0.5068164467811584,
667
+ "learning_rate": 7.139676265139985e-07,
668
+ "loss": 0.7515,
669
+ "step": 770
670
+ },
671
+ {
672
+ "epoch": 1.718853362734289,
673
+ "grad_norm": 0.4229294955730438,
674
+ "learning_rate": 7.05240118598026e-07,
675
+ "loss": 0.766,
676
+ "step": 780
677
+ },
678
+ {
679
+ "epoch": 1.74090407938258,
680
+ "grad_norm": 0.43405506014823914,
681
+ "learning_rate": 6.964367451479497e-07,
682
+ "loss": 0.7944,
683
+ "step": 790
684
+ },
685
+ {
686
+ "epoch": 1.7629547960308711,
687
+ "grad_norm": 0.6145793199539185,
688
+ "learning_rate": 6.875607602674665e-07,
689
+ "loss": 0.76,
690
+ "step": 800
691
+ },
692
+ {
693
+ "epoch": 1.7629547960308711,
694
+ "eval_loss": 0.8161342144012451,
695
+ "eval_runtime": 31.0504,
696
+ "eval_samples_per_second": 12.464,
697
+ "eval_steps_per_second": 6.248,
698
+ "step": 800
699
+ },
700
+ {
701
+ "epoch": 1.7850055126791622,
702
+ "grad_norm": 0.5480120182037354,
703
+ "learning_rate": 6.786154449005663e-07,
704
+ "loss": 0.6947,
705
+ "step": 810
706
+ },
707
+ {
708
+ "epoch": 1.8070562293274532,
709
+ "grad_norm": 0.4307333827018738,
710
+ "learning_rate": 6.696041056187554e-07,
711
+ "loss": 0.8518,
712
+ "step": 820
713
+ },
714
+ {
715
+ "epoch": 1.8291069459757443,
716
+ "grad_norm": 0.48760783672332764,
717
+ "learning_rate": 6.605300733988051e-07,
718
+ "loss": 0.9018,
719
+ "step": 830
720
+ },
721
+ {
722
+ "epoch": 1.8511576626240354,
723
+ "grad_norm": 0.4379078447818756,
724
+ "learning_rate": 6.513967023914807e-07,
725
+ "loss": 0.7699,
726
+ "step": 840
727
+ },
728
+ {
729
+ "epoch": 1.8732083792723264,
730
+ "grad_norm": 0.5063772201538086,
731
+ "learning_rate": 6.422073686817032e-07,
732
+ "loss": 0.7034,
733
+ "step": 850
734
+ },
735
+ {
736
+ "epoch": 1.8732083792723264,
737
+ "eval_loss": 0.8075295090675354,
738
+ "eval_runtime": 31.4599,
739
+ "eval_samples_per_second": 12.301,
740
+ "eval_steps_per_second": 6.167,
741
+ "step": 850
742
+ },
743
+ {
744
+ "epoch": 1.8952590959206175,
745
+ "grad_norm": 0.39252328872680664,
746
+ "learning_rate": 6.329654690406045e-07,
747
+ "loss": 0.7019,
748
+ "step": 860
749
+ },
750
+ {
751
+ "epoch": 1.9173098125689085,
752
+ "grad_norm": 1.636116862297058,
753
+ "learning_rate": 6.236744196699353e-07,
754
+ "loss": 0.7102,
755
+ "step": 870
756
+ },
757
+ {
758
+ "epoch": 1.9393605292171996,
759
+ "grad_norm": 0.3736509084701538,
760
+ "learning_rate": 6.143376549392898e-07,
761
+ "loss": 0.8171,
762
+ "step": 880
763
+ },
764
+ {
765
+ "epoch": 1.9614112458654906,
766
+ "grad_norm": 0.4188951551914215,
767
+ "learning_rate": 6.049586261166168e-07,
768
+ "loss": 0.7083,
769
+ "step": 890
770
+ },
771
+ {
772
+ "epoch": 1.9834619625137817,
773
+ "grad_norm": 0.5519161224365234,
774
+ "learning_rate": 5.955408000924826e-07,
775
+ "loss": 0.8761,
776
+ "step": 900
777
+ },
778
+ {
779
+ "epoch": 1.9834619625137817,
780
+ "eval_loss": 0.7999474406242371,
781
+ "eval_runtime": 31.2242,
782
+ "eval_samples_per_second": 12.394,
783
+ "eval_steps_per_second": 6.213,
784
+ "step": 900
785
+ },
786
+ {
787
+ "epoch": 2.0044101433296584,
788
+ "grad_norm": 0.48845574259757996,
789
+ "learning_rate": 5.860876580985605e-07,
790
+ "loss": 0.7645,
791
+ "step": 910
792
+ },
793
+ {
794
+ "epoch": 2.0264608599779494,
795
+ "grad_norm": 0.4153165817260742,
796
+ "learning_rate": 5.766026944208171e-07,
797
+ "loss": 0.7235,
798
+ "step": 920
799
+ },
800
+ {
801
+ "epoch": 2.0485115766262405,
802
+ "grad_norm": 0.4561828076839447,
803
+ "learning_rate": 5.670894151078768e-07,
804
+ "loss": 0.8001,
805
+ "step": 930
806
+ },
807
+ {
808
+ "epoch": 2.0705622932745316,
809
+ "grad_norm": 0.45872876048088074,
810
+ "learning_rate": 5.575513366750338e-07,
811
+ "loss": 0.6565,
812
+ "step": 940
813
+ },
814
+ {
815
+ "epoch": 2.0926130099228226,
816
+ "grad_norm": 0.49032464623451233,
817
+ "learning_rate": 5.479919848044e-07,
818
+ "loss": 0.7339,
819
+ "step": 950
820
+ },
821
+ {
822
+ "epoch": 2.0926130099228226,
823
+ "eval_loss": 0.7934562563896179,
824
+ "eval_runtime": 31.4243,
825
+ "eval_samples_per_second": 12.315,
826
+ "eval_steps_per_second": 6.174,
827
+ "step": 950
828
+ },
829
+ {
830
+ "epoch": 2.1146637265711137,
831
+ "grad_norm": 0.4034031629562378,
832
+ "learning_rate": 5.384148930416592e-07,
833
+ "loss": 0.7465,
834
+ "step": 960
835
+ },
836
+ {
837
+ "epoch": 2.1367144432194047,
838
+ "grad_norm": 0.5809303522109985,
839
+ "learning_rate": 5.288236014899199e-07,
840
+ "loss": 0.7062,
841
+ "step": 970
842
+ },
843
+ {
844
+ "epoch": 2.158765159867696,
845
+ "grad_norm": 0.5515176653862,
846
+ "learning_rate": 5.1922165550114e-07,
847
+ "loss": 0.7657,
848
+ "step": 980
849
+ },
850
+ {
851
+ "epoch": 2.180815876515987,
852
+ "grad_norm": 0.4212632477283478,
853
+ "learning_rate": 5.096126043656131e-07,
854
+ "loss": 0.7649,
855
+ "step": 990
856
+ },
857
+ {
858
+ "epoch": 2.202866593164278,
859
+ "grad_norm": 0.43177521228790283,
860
+ "learning_rate": 5e-07,
861
+ "loss": 0.7089,
862
+ "step": 1000
863
+ },
864
+ {
865
+ "epoch": 2.202866593164278,
866
+ "eval_loss": 0.7877091765403748,
867
+ "eval_runtime": 31.443,
868
+ "eval_samples_per_second": 12.308,
869
+ "eval_steps_per_second": 6.17,
870
+ "step": 1000
871
+ },
872
+ {
873
+ "epoch": 2.224917309812569,
874
+ "grad_norm": 1.1922564506530762,
875
+ "learning_rate": 4.903873956343869e-07,
876
+ "loss": 0.776,
877
+ "step": 1010
878
+ },
879
+ {
880
+ "epoch": 2.24696802646086,
881
+ "grad_norm": 0.4026447832584381,
882
+ "learning_rate": 4.8077834449886e-07,
883
+ "loss": 0.7671,
884
+ "step": 1020
885
+ },
886
+ {
887
+ "epoch": 2.269018743109151,
888
+ "grad_norm": 0.4751526415348053,
889
+ "learning_rate": 4.711763985100801e-07,
890
+ "loss": 0.6684,
891
+ "step": 1030
892
+ },
893
+ {
894
+ "epoch": 2.291069459757442,
895
+ "grad_norm": 0.6750423312187195,
896
+ "learning_rate": 4.6158510695834073e-07,
897
+ "loss": 0.8802,
898
+ "step": 1040
899
+ },
900
+ {
901
+ "epoch": 2.313120176405733,
902
+ "grad_norm": 0.6013314723968506,
903
+ "learning_rate": 4.520080151956001e-07,
904
+ "loss": 0.7591,
905
+ "step": 1050
906
+ },
907
+ {
908
+ "epoch": 2.313120176405733,
909
+ "eval_loss": 0.7824860215187073,
910
+ "eval_runtime": 30.8951,
911
+ "eval_samples_per_second": 12.526,
912
+ "eval_steps_per_second": 6.279,
913
+ "step": 1050
914
+ },
915
+ {
916
+ "epoch": 2.3351708930540243,
917
+ "grad_norm": 0.6955768465995789,
918
+ "learning_rate": 4.4244866332496606e-07,
919
+ "loss": 0.8971,
920
+ "step": 1060
921
+ },
922
+ {
923
+ "epoch": 2.3572216097023153,
924
+ "grad_norm": 0.5380210876464844,
925
+ "learning_rate": 4.329105848921233e-07,
926
+ "loss": 0.7925,
927
+ "step": 1070
928
+ },
929
+ {
930
+ "epoch": 2.3792723263506064,
931
+ "grad_norm": 0.5366503000259399,
932
+ "learning_rate": 4.23397305579183e-07,
933
+ "loss": 0.7786,
934
+ "step": 1080
935
+ },
936
+ {
937
+ "epoch": 2.4013230429988974,
938
+ "grad_norm": 0.5527025461196899,
939
+ "learning_rate": 4.139123419014396e-07,
940
+ "loss": 0.8109,
941
+ "step": 1090
942
+ },
943
+ {
944
+ "epoch": 2.4233737596471885,
945
+ "grad_norm": 0.4577449560165405,
946
+ "learning_rate": 4.044591999075172e-07,
947
+ "loss": 0.6928,
948
+ "step": 1100
949
+ },
950
+ {
951
+ "epoch": 2.4233737596471885,
952
+ "eval_loss": 0.7777835130691528,
953
+ "eval_runtime": 31.369,
954
+ "eval_samples_per_second": 12.337,
955
+ "eval_steps_per_second": 6.184,
956
+ "step": 1100
957
+ },
958
+ {
959
+ "epoch": 2.4454244762954795,
960
+ "grad_norm": 0.5419116616249084,
961
+ "learning_rate": 3.950413738833831e-07,
962
+ "loss": 0.9256,
963
+ "step": 1110
964
+ },
965
+ {
966
+ "epoch": 2.4674751929437706,
967
+ "grad_norm": 0.6707462072372437,
968
+ "learning_rate": 3.8566234506071025e-07,
969
+ "loss": 0.6892,
970
+ "step": 1120
971
+ },
972
+ {
973
+ "epoch": 2.4895259095920617,
974
+ "grad_norm": 0.5671144723892212,
975
+ "learning_rate": 3.763255803300646e-07,
976
+ "loss": 0.711,
977
+ "step": 1130
978
+ },
979
+ {
980
+ "epoch": 2.5115766262403527,
981
+ "grad_norm": 0.45249974727630615,
982
+ "learning_rate": 3.670345309593954e-07,
983
+ "loss": 0.8348,
984
+ "step": 1140
985
+ },
986
+ {
987
+ "epoch": 2.5336273428886438,
988
+ "grad_norm": 0.42139536142349243,
989
+ "learning_rate": 3.5779263131829687e-07,
990
+ "loss": 0.7553,
991
+ "step": 1150
992
+ },
993
+ {
994
+ "epoch": 2.5336273428886438,
995
+ "eval_loss": 0.7735557556152344,
996
+ "eval_runtime": 31.1989,
997
+ "eval_samples_per_second": 12.404,
998
+ "eval_steps_per_second": 6.218,
999
+ "step": 1150
1000
+ },
1001
+ {
1002
+ "epoch": 2.555678059536935,
1003
+ "grad_norm": 0.44612178206443787,
1004
+ "learning_rate": 3.4860329760851946e-07,
1005
+ "loss": 0.7264,
1006
+ "step": 1160
1007
+ },
1008
+ {
1009
+ "epoch": 2.577728776185226,
1010
+ "grad_norm": 0.40486013889312744,
1011
+ "learning_rate": 3.3946992660119466e-07,
1012
+ "loss": 0.6216,
1013
+ "step": 1170
1014
+ },
1015
+ {
1016
+ "epoch": 2.599779492833517,
1017
+ "grad_norm": 0.4542286694049835,
1018
+ "learning_rate": 3.303958943812445e-07,
1019
+ "loss": 0.7504,
1020
+ "step": 1180
1021
+ },
1022
+ {
1023
+ "epoch": 2.621830209481808,
1024
+ "grad_norm": 0.4070703387260437,
1025
+ "learning_rate": 3.213845550994336e-07,
1026
+ "loss": 0.6421,
1027
+ "step": 1190
1028
+ },
1029
+ {
1030
+ "epoch": 2.643880926130099,
1031
+ "grad_norm": 0.6060696840286255,
1032
+ "learning_rate": 3.1243923973253363e-07,
1033
+ "loss": 0.7927,
1034
+ "step": 1200
1035
+ },
1036
+ {
1037
+ "epoch": 2.643880926130099,
1038
+ "eval_loss": 0.770568311214447,
1039
+ "eval_runtime": 31.178,
1040
+ "eval_samples_per_second": 12.413,
1041
+ "eval_steps_per_second": 6.222,
1042
+ "step": 1200
1043
+ },
1044
+ {
1045
+ "epoch": 2.66593164277839,
1046
+ "grad_norm": 0.5011975765228271,
1047
+ "learning_rate": 3.035632548520502e-07,
1048
+ "loss": 0.768,
1049
+ "step": 1210
1050
+ },
1051
+ {
1052
+ "epoch": 2.687982359426681,
1053
+ "grad_norm": 0.49224653840065,
1054
+ "learning_rate": 2.9475988140197396e-07,
1055
+ "loss": 0.7311,
1056
+ "step": 1220
1057
+ },
1058
+ {
1059
+ "epoch": 2.7100330760749722,
1060
+ "grad_norm": 0.3389280140399933,
1061
+ "learning_rate": 2.860323734860016e-07,
1062
+ "loss": 0.7756,
1063
+ "step": 1230
1064
+ },
1065
+ {
1066
+ "epoch": 2.7320837927232633,
1067
+ "grad_norm": 0.49108514189720154,
1068
+ "learning_rate": 2.7738395716468117e-07,
1069
+ "loss": 0.7258,
1070
+ "step": 1240
1071
+ },
1072
+ {
1073
+ "epoch": 2.7541345093715544,
1074
+ "grad_norm": 0.6292597055435181,
1075
+ "learning_rate": 2.6881782926292083e-07,
1076
+ "loss": 0.797,
1077
+ "step": 1250
1078
+ },
1079
+ {
1080
+ "epoch": 2.7541345093715544,
1081
+ "eval_loss": 0.7677077054977417,
1082
+ "eval_runtime": 31.885,
1083
+ "eval_samples_per_second": 12.137,
1084
+ "eval_steps_per_second": 6.084,
1085
+ "step": 1250
1086
+ },
1087
+ {
1088
+ "epoch": 2.7761852260198454,
1089
+ "grad_norm": 0.626222550868988,
1090
+ "learning_rate": 2.6033715618830693e-07,
1091
+ "loss": 0.834,
1092
+ "step": 1260
1093
+ },
1094
+ {
1095
+ "epoch": 2.7982359426681365,
1096
+ "grad_norm": 0.6133826375007629,
1097
+ "learning_rate": 2.51945072760663e-07,
1098
+ "loss": 0.8028,
1099
+ "step": 1270
1100
+ },
1101
+ {
1102
+ "epoch": 2.8202866593164275,
1103
+ "grad_norm": 0.5342646241188049,
1104
+ "learning_rate": 2.4364468105328633e-07,
1105
+ "loss": 0.7708,
1106
+ "step": 1280
1107
+ },
1108
+ {
1109
+ "epoch": 2.8423373759647186,
1110
+ "grad_norm": 0.39017361402511597,
1111
+ "learning_rate": 2.3543904924628961e-07,
1112
+ "loss": 0.6769,
1113
+ "step": 1290
1114
+ },
1115
+ {
1116
+ "epoch": 2.86438809261301,
1117
+ "grad_norm": 0.6175335049629211,
1118
+ "learning_rate": 2.2733121049246912e-07,
1119
+ "loss": 0.7685,
1120
+ "step": 1300
1121
+ },
1122
+ {
1123
+ "epoch": 2.86438809261301,
1124
+ "eval_loss": 0.7651455402374268,
1125
+ "eval_runtime": 32.1286,
1126
+ "eval_samples_per_second": 12.045,
1127
+ "eval_steps_per_second": 6.038,
1128
+ "step": 1300
1129
+ },
1130
+ {
1131
+ "epoch": 2.886438809261301,
1132
+ "grad_norm": 0.6804171204566956,
1133
+ "learning_rate": 2.1932416179612284e-07,
1134
+ "loss": 0.7988,
1135
+ "step": 1310
1136
+ },
1137
+ {
1138
+ "epoch": 2.908489525909592,
1139
+ "grad_norm": 0.4666235148906708,
1140
+ "learning_rate": 2.1142086290522798e-07,
1141
+ "loss": 0.7256,
1142
+ "step": 1320
1143
+ },
1144
+ {
1145
+ "epoch": 2.9305402425578833,
1146
+ "grad_norm": 0.4415511190891266,
1147
+ "learning_rate": 2.036242352173928e-07,
1148
+ "loss": 0.8247,
1149
+ "step": 1330
1150
+ },
1151
+ {
1152
+ "epoch": 2.9525909592061743,
1153
+ "grad_norm": 0.7598405480384827,
1154
+ "learning_rate": 1.959371606999823e-07,
1155
+ "loss": 0.7665,
1156
+ "step": 1340
1157
+ },
1158
+ {
1159
+ "epoch": 2.9746416758544654,
1160
+ "grad_norm": 0.8235193490982056,
1161
+ "learning_rate": 1.8836248082481932e-07,
1162
+ "loss": 0.7006,
1163
+ "step": 1350
1164
+ },
1165
+ {
1166
+ "epoch": 2.9746416758544654,
1167
+ "eval_loss": 0.7628360986709595,
1168
+ "eval_runtime": 31.6956,
1169
+ "eval_samples_per_second": 12.21,
1170
+ "eval_steps_per_second": 6.121,
1171
+ "step": 1350
1172
+ },
1173
+ {
1174
+ "epoch": 2.9966923925027564,
1175
+ "grad_norm": 0.47277387976646423,
1176
+ "learning_rate": 1.8090299551785543e-07,
1177
+ "loss": 0.7231,
1178
+ "step": 1360
1179
+ },
1180
+ {
1181
+ "epoch": 3.0176405733186327,
1182
+ "grad_norm": 0.46243688464164734,
1183
+ "learning_rate": 1.7356146212419865e-07,
1184
+ "loss": 0.7582,
1185
+ "step": 1370
1186
+ },
1187
+ {
1188
+ "epoch": 3.0396912899669237,
1189
+ "grad_norm": 0.5353012084960938,
1190
+ "learning_rate": 1.6634059438888032e-07,
1191
+ "loss": 0.6686,
1192
+ "step": 1380
1193
+ },
1194
+ {
1195
+ "epoch": 3.061742006615215,
1196
+ "grad_norm": 0.5801079869270325,
1197
+ "learning_rate": 1.5924306145373845e-07,
1198
+ "loss": 0.7798,
1199
+ "step": 1390
1200
+ },
1201
+ {
1202
+ "epoch": 3.083792723263506,
1203
+ "grad_norm": 0.49165478348731995,
1204
+ "learning_rate": 1.5227148687078943e-07,
1205
+ "loss": 0.7124,
1206
+ "step": 1400
1207
+ },
1208
+ {
1209
+ "epoch": 3.083792723263506,
1210
+ "eval_loss": 0.761391818523407,
1211
+ "eval_runtime": 32.2827,
1212
+ "eval_samples_per_second": 11.988,
1213
+ "eval_steps_per_second": 6.009,
1214
+ "step": 1400
1215
+ },
1216
+ {
1217
+ "epoch": 3.1058434399117973,
1218
+ "grad_norm": 0.5716243982315063,
1219
+ "learning_rate": 1.4542844763245e-07,
1220
+ "loss": 0.7299,
1221
+ "step": 1410
1222
+ },
1223
+ {
1224
+ "epoch": 3.1278941565600884,
1225
+ "grad_norm": 0.43414145708084106,
1226
+ "learning_rate": 1.3871647321897134e-07,
1227
+ "loss": 0.8703,
1228
+ "step": 1420
1229
+ },
1230
+ {
1231
+ "epoch": 3.1499448732083795,
1232
+ "grad_norm": 0.4742828905582428,
1233
+ "learning_rate": 1.321380446634342e-07,
1234
+ "loss": 0.7103,
1235
+ "step": 1430
1236
+ },
1237
+ {
1238
+ "epoch": 3.1719955898566705,
1239
+ "grad_norm": 0.4828404188156128,
1240
+ "learning_rate": 1.2569559363465349e-07,
1241
+ "loss": 0.7584,
1242
+ "step": 1440
1243
+ },
1244
+ {
1245
+ "epoch": 3.1940463065049616,
1246
+ "grad_norm": 0.5527871251106262,
1247
+ "learning_rate": 1.1939150153832878e-07,
1248
+ "loss": 0.6797,
1249
+ "step": 1450
1250
+ },
1251
+ {
1252
+ "epoch": 3.1940463065049616,
1253
+ "eval_loss": 0.7600497603416443,
1254
+ "eval_runtime": 31.6186,
1255
+ "eval_samples_per_second": 12.24,
1256
+ "eval_steps_per_second": 6.136,
1257
+ "step": 1450
1258
+ },
1259
+ {
1260
+ "epoch": 3.2160970231532526,
1261
+ "grad_norm": 0.6190243363380432,
1262
+ "learning_rate": 1.132280986367754e-07,
1263
+ "loss": 0.8045,
1264
+ "step": 1460
1265
+ },
1266
+ {
1267
+ "epoch": 3.2381477398015437,
1268
+ "grad_norm": 0.4672952890396118,
1269
+ "learning_rate": 1.0720766318755897e-07,
1270
+ "loss": 0.6964,
1271
+ "step": 1470
1272
+ },
1273
+ {
1274
+ "epoch": 3.2601984564498347,
1275
+ "grad_norm": 0.5040985345840454,
1276
+ "learning_rate": 1.01332420601355e-07,
1277
+ "loss": 0.6955,
1278
+ "step": 1480
1279
+ },
1280
+ {
1281
+ "epoch": 3.282249173098126,
1282
+ "grad_norm": 0.4153273403644562,
1283
+ "learning_rate": 9.560454261934048e-08,
1284
+ "loss": 0.684,
1285
+ "step": 1490
1286
+ },
1287
+ {
1288
+ "epoch": 3.304299889746417,
1289
+ "grad_norm": 0.6224635243415833,
1290
+ "learning_rate": 9.00261465104264e-08,
1291
+ "loss": 0.8161,
1292
+ "step": 1500
1293
+ },
1294
+ {
1295
+ "epoch": 3.304299889746417,
1296
+ "eval_loss": 0.7590056657791138,
1297
+ "eval_runtime": 31.0884,
1298
+ "eval_samples_per_second": 12.448,
1299
+ "eval_steps_per_second": 6.24,
1300
+ "step": 1500
1301
+ },
1302
+ {
1303
+ "epoch": 3.326350606394708,
1304
+ "grad_norm": 0.49411261081695557,
1305
+ "learning_rate": 8.45992942886244e-08,
1306
+ "loss": 0.7435,
1307
+ "step": 1510
1308
+ },
1309
+ {
1310
+ "epoch": 3.348401323042999,
1311
+ "grad_norm": 0.5820800065994263,
1312
+ "learning_rate": 7.932599195083744e-08,
1313
+ "loss": 0.6804,
1314
+ "step": 1520
1315
+ },
1316
+ {
1317
+ "epoch": 3.37045203969129,
1318
+ "grad_norm": 0.4719645380973816,
1319
+ "learning_rate": 7.4208188735358e-08,
1320
+ "loss": 0.6657,
1321
+ "step": 1530
1322
+ },
1323
+ {
1324
+ "epoch": 3.392502756339581,
1325
+ "grad_norm": 0.48559075593948364,
1326
+ "learning_rate": 6.924777640134627e-08,
1327
+ "loss": 0.7233,
1328
+ "step": 1540
1329
+ },
1330
+ {
1331
+ "epoch": 3.414553472987872,
1332
+ "grad_norm": 0.4606567621231079,
1333
+ "learning_rate": 6.444658852955492e-08,
1334
+ "loss": 0.786,
1335
+ "step": 1550
1336
+ },
1337
+ {
1338
+ "epoch": 3.414553472987872,
1339
+ "eval_loss": 0.7584723830223083,
1340
+ "eval_runtime": 31.5971,
1341
+ "eval_samples_per_second": 12.248,
1342
+ "eval_steps_per_second": 6.14,
1343
+ "step": 1550
1344
+ },
1345
+ {
1346
+ "epoch": 3.436604189636163,
1347
+ "grad_norm": 0.6529830098152161,
1348
+ "learning_rate": 5.98063998445587e-08,
1349
+ "loss": 0.7732,
1350
+ "step": 1560
1351
+ },
1352
+ {
1353
+ "epoch": 3.4586549062844543,
1354
+ "grad_norm": 0.538375973701477,
1355
+ "learning_rate": 5.532892555874058e-08,
1356
+ "loss": 0.7632,
1357
+ "step": 1570
1358
+ },
1359
+ {
1360
+ "epoch": 3.4807056229327453,
1361
+ "grad_norm": 0.5255725383758545,
1362
+ "learning_rate": 5.1015820738276095e-08,
1363
+ "loss": 0.7578,
1364
+ "step": 1580
1365
+ },
1366
+ {
1367
+ "epoch": 3.5027563395810364,
1368
+ "grad_norm": 0.44121870398521423,
1369
+ "learning_rate": 4.6868679691349775e-08,
1370
+ "loss": 0.61,
1371
+ "step": 1590
1372
+ },
1373
+ {
1374
+ "epoch": 3.5248070562293274,
1375
+ "grad_norm": 0.6190388798713684,
1376
+ "learning_rate": 4.288903537883021e-08,
1377
+ "loss": 0.7699,
1378
+ "step": 1600
1379
+ },
1380
+ {
1381
+ "epoch": 3.5248070562293274,
1382
+ "eval_loss": 0.7578777074813843,
1383
+ "eval_runtime": 30.8062,
1384
+ "eval_samples_per_second": 12.562,
1385
+ "eval_steps_per_second": 6.297,
1386
+ "step": 1600
1387
+ }
1388
+ ],
1389
+ "logging_steps": 10,
1390
+ "max_steps": 1816,
1391
+ "num_input_tokens_seen": 0,
1392
+ "num_train_epochs": 4,
1393
+ "save_steps": 100,
1394
+ "stateful_callbacks": {
1395
+ "TrainerControl": {
1396
+ "args": {
1397
+ "should_epoch_stop": false,
1398
+ "should_evaluate": false,
1399
+ "should_log": false,
1400
+ "should_save": true,
1401
+ "should_training_stop": false
1402
+ },
1403
+ "attributes": {}
1404
+ }
1405
+ },
1406
+ "total_flos": 4.874293166149386e+17,
1407
+ "train_batch_size": 4,
1408
+ "trial_name": null,
1409
+ "trial_params": null
1410
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff