alicegoesdown committed on
Commit
f8ac6b9
·
verified ·
1 Parent(s): 43b4989

Training in progress, step 150, checkpoint

Browse files
last-checkpoint/README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- base_model: unsloth/mistral-7b-v0.2-bnb-4bit
3
  library_name: peft
4
  ---
5
 
 
1
  ---
2
+ base_model: unsloth/qwen2.5-coder-7b-instruct-bnb-4bit
3
  library_name: peft
4
  ---
5
 
last-checkpoint/adapter_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
- "base_model_name_or_path": "unsloth/mistral-7b-v0.2-bnb-4bit",
5
  "bias": "none",
6
  "fan_in_fan_out": false,
7
  "inference_mode": true,
@@ -10,20 +10,22 @@
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
- "lora_alpha": 8,
14
- "lora_dropout": 0.3,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": [
18
  "lm_head"
19
  ],
20
  "peft_type": "LORA",
21
- "r": 4,
22
  "rank_pattern": {},
23
  "revision": null,
24
  "target_modules": [
25
- "q_proj",
26
- "k_proj"
 
 
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": "unsloth/qwen2.5-coder-7b-instruct-bnb-4bit",
5
  "bias": "none",
6
  "fan_in_fan_out": false,
7
  "inference_mode": true,
 
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
+ "lora_alpha": 32,
14
+ "lora_dropout": 0.1,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": [
18
  "lm_head"
19
  ],
20
  "peft_type": "LORA",
21
+ "r": 16,
22
  "rank_pattern": {},
23
  "revision": null,
24
  "target_modules": [
25
+ "v_proj",
26
+ "k_proj",
27
+ "o_proj",
28
+ "q_proj"
29
  ],
30
  "task_type": "CAUSAL_LM",
31
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f56b62afafa33dfd8b1b6e3a3a411c40052a60f2cb9cba1520f9aa1fb083e197
3
- size 268976704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78a35c9bf56b6958852618a347794d9a07c38e3b73b92ca4134c852aefbb9cfc
3
+ size 1130395064
last-checkpoint/added_tokens.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|PAD_TOKEN|>": 151665,
5
+ "<|box_end|>": 151649,
6
+ "<|box_start|>": 151648,
7
+ "<|endoftext|>": 151643,
8
+ "<|file_sep|>": 151664,
9
+ "<|fim_middle|>": 151660,
10
+ "<|fim_pad|>": 151662,
11
+ "<|fim_prefix|>": 151659,
12
+ "<|fim_suffix|>": 151661,
13
+ "<|im_end|>": 151645,
14
+ "<|im_start|>": 151644,
15
+ "<|image_pad|>": 151655,
16
+ "<|object_ref_end|>": 151647,
17
+ "<|object_ref_start|>": 151646,
18
+ "<|quad_end|>": 151651,
19
+ "<|quad_start|>": 151650,
20
+ "<|repo_name|>": 151663,
21
+ "<|video_pad|>": 151656,
22
+ "<|vision_end|>": 151653,
23
+ "<|vision_pad|>": 151654,
24
+ "<|vision_start|>": 151652
25
+ }
last-checkpoint/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3257b4060bbcffa20dbf50b84dbc274de04161f8b3aaf73ab5e72522e67c58d3
3
- size 538028282
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe92dd1c86c461ff4f15941bc89ffc7980c15094911a2c40b4a2e40b4b1ebcf0
3
+ size 2260919034
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3b41a1419a7ce1114729ad2a7f33c993b3e0b261358e0b7b2af1aa3a7bbb747
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41fbcdbe5adca339cf0181ba6260a02997afeb1893f4a69f7f561564f16dc030
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a7f437771301009dca72a1d84c876c60a7a5bf93817926c530198c7bc53364b
3
  size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4216529ab9188af81d4b0557210a4eede3dcd97c58f39c0c7aa7cf54242dd254
3
  size 1256
last-checkpoint/special_tokens_map.json CHANGED
@@ -1,27 +1,28 @@
1
  {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
 
 
 
 
 
 
 
 
9
  "eos_token": {
10
- "content": "</s>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
  "pad_token": {
17
- "content": "<unk>",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
- "unk_token": {
24
- "content": "<unk>",
25
  "lstrip": false,
26
  "normalized": false,
27
  "rstrip": false,
 
1
  {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
  "eos_token": {
18
+ "content": "<|im_end|>",
19
  "lstrip": false,
20
  "normalized": false,
21
  "rstrip": false,
22
  "single_word": false
23
  },
24
  "pad_token": {
25
+ "content": "<|PAD_TOKEN|>",
 
 
 
 
 
 
 
26
  "lstrip": false,
27
  "normalized": false,
28
  "rstrip": false,
last-checkpoint/tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2699839c243202a04a90537576fc719283f638e90fe80feb469888275289575
3
- size 3505751
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fab42efe8d17406525a9154b728cf9e957629a8ed7ce997770efdd71128c6a1a
3
+ size 11422086
last-checkpoint/tokenizer_config.json CHANGED
@@ -1,26 +1,185 @@
1
  {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
- "add_prefix_space": true,
5
  "added_tokens_decoder": {
6
- "0": {
7
- "content": "<unk>",
8
  "lstrip": false,
9
  "normalized": false,
10
  "rstrip": false,
11
  "single_word": false,
12
  "special": true
13
  },
14
- "1": {
15
- "content": "<s>",
16
  "lstrip": false,
17
  "normalized": false,
18
  "rstrip": false,
19
  "single_word": false,
20
  "special": true
21
  },
22
- "2": {
23
- "content": "</s>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  "lstrip": false,
25
  "normalized": false,
26
  "rstrip": false,
@@ -28,17 +187,31 @@
28
  "special": true
29
  }
30
  },
31
- "bos_token": "<s>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  "clean_up_tokenization_spaces": false,
33
- "eos_token": "</s>",
 
34
  "extra_special_tokens": {},
35
- "legacy": true,
36
- "model_max_length": 1000000000000000019884624838656,
37
- "pad_token": "<unk>",
38
  "padding_side": "left",
39
- "sp_model_kwargs": {},
40
- "spaces_between_special_tokens": false,
41
- "tokenizer_class": "LlamaTokenizer",
42
- "unk_token": "<unk>",
43
- "use_default_system_prompt": false
44
  }
 
1
  {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
 
4
  "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
  "lstrip": false,
8
  "normalized": false,
9
  "rstrip": false,
10
  "single_word": false,
11
  "special": true
12
  },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
  "lstrip": false,
16
  "normalized": false,
17
  "rstrip": false,
18
  "single_word": false,
19
  "special": true
20
  },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<|PAD_TOKEN|>",
183
  "lstrip": false,
184
  "normalized": false,
185
  "rstrip": false,
 
187
  "special": true
188
  }
189
  },
190
+ "additional_special_tokens": [
191
+ "<|im_start|>",
192
+ "<|im_end|>",
193
+ "<|object_ref_start|>",
194
+ "<|object_ref_end|>",
195
+ "<|box_start|>",
196
+ "<|box_end|>",
197
+ "<|quad_start|>",
198
+ "<|quad_end|>",
199
+ "<|vision_start|>",
200
+ "<|vision_end|>",
201
+ "<|vision_pad|>",
202
+ "<|image_pad|>",
203
+ "<|video_pad|>"
204
+ ],
205
+ "bos_token": null,
206
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
207
  "clean_up_tokenization_spaces": false,
208
+ "eos_token": "<|im_end|>",
209
+ "errors": "replace",
210
  "extra_special_tokens": {},
211
+ "model_max_length": 131072,
212
+ "pad_token": "<|PAD_TOKEN|>",
 
213
  "padding_side": "left",
214
+ "split_special_tokens": false,
215
+ "tokenizer_class": "Qwen2Tokenizer",
216
+ "unk_token": null
 
 
217
  }
last-checkpoint/trainer_state.json CHANGED
@@ -1,2052 +1,131 @@
1
  {
2
- "best_metric": 0.6792302131652832,
3
- "best_model_checkpoint": "./output/checkpoint-750",
4
- "epoch": 9.44055944055944,
5
  "eval_steps": 150,
6
- "global_step": 2700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.03496503496503497,
13
- "grad_norm": 16.180435180664062,
14
- "learning_rate": 2.9999999999999984e-06,
15
- "loss": 0.6729,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.06993006993006994,
20
- "grad_norm": 17.6674861907959,
21
- "learning_rate": 5.999999999999997e-06,
22
- "loss": 0.7485,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.1048951048951049,
27
- "grad_norm": 21.324867248535156,
28
- "learning_rate": 8.999999999999993e-06,
29
- "loss": 0.7391,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.13986013986013987,
34
- "grad_norm": 17.68355941772461,
35
- "learning_rate": 1.1999999999999994e-05,
36
- "loss": 0.533,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.17482517482517482,
41
- "grad_norm": 19.983943939208984,
42
- "learning_rate": 1.499999999999999e-05,
43
- "loss": 0.5843,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.2097902097902098,
48
- "grad_norm": 10.12471866607666,
49
- "learning_rate": 1.7999999999999987e-05,
50
- "loss": 0.7979,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.24475524475524477,
55
- "grad_norm": 18.6708984375,
56
- "learning_rate": 2.0999999999999985e-05,
57
- "loss": 0.683,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.27972027972027974,
62
- "grad_norm": 21.57349395751953,
63
- "learning_rate": 2.3999999999999987e-05,
64
- "loss": 0.7798,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.3146853146853147,
69
- "grad_norm": 12.475003242492676,
70
- "learning_rate": 2.6999999999999982e-05,
71
- "loss": 0.7242,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.34965034965034963,
76
- "grad_norm": 14.752897262573242,
77
- "learning_rate": 2.999999999999998e-05,
78
- "loss": 0.8674,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.38461538461538464,
83
- "grad_norm": 11.63265323638916,
84
- "learning_rate": 2.999969170437547e-05,
85
- "loss": 0.5374,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 0.4195804195804196,
90
- "grad_norm": 31.700468063354492,
91
- "learning_rate": 2.999876683017477e-05,
92
- "loss": 0.705,
93
  "step": 120
94
  },
95
  {
96
- "epoch": 0.45454545454545453,
97
- "grad_norm": 12.862920761108398,
98
- "learning_rate": 2.999722541541583e-05,
99
- "loss": 0.6963,
100
  "step": 130
101
  },
102
  {
103
- "epoch": 0.48951048951048953,
104
- "grad_norm": 42.59748077392578,
105
- "learning_rate": 2.9995067523460178e-05,
106
- "loss": 0.6837,
107
  "step": 140
108
  },
109
  {
110
- "epoch": 0.5244755244755245,
111
- "grad_norm": 10.075207710266113,
112
- "learning_rate": 2.99922932430103e-05,
113
- "loss": 0.5468,
114
  "step": 150
115
  },
116
  {
117
- "epoch": 0.5244755244755245,
118
- "eval_loss": 0.7486817240715027,
119
- "eval_runtime": 5.5115,
120
- "eval_samples_per_second": 11.612,
121
- "eval_steps_per_second": 11.612,
122
  "step": 150
123
- },
124
- {
125
- "epoch": 0.5594405594405595,
126
- "grad_norm": 17.550865173339844,
127
- "learning_rate": 2.9988902688105994e-05,
128
- "loss": 0.7207,
129
- "step": 160
130
- },
131
- {
132
- "epoch": 0.5944055944055944,
133
- "grad_norm": 23.878662109375,
134
- "learning_rate": 2.9984895998119703e-05,
135
- "loss": 0.6694,
136
- "step": 170
137
- },
138
- {
139
- "epoch": 0.6293706293706294,
140
- "grad_norm": 17.137025833129883,
141
- "learning_rate": 2.9980273337750747e-05,
142
- "loss": 0.6492,
143
- "step": 180
144
- },
145
- {
146
- "epoch": 0.6643356643356644,
147
- "grad_norm": 24.956979751586914,
148
- "learning_rate": 2.9975034897018594e-05,
149
- "loss": 0.6865,
150
- "step": 190
151
- },
152
- {
153
- "epoch": 0.6993006993006993,
154
- "grad_norm": 12.822979927062988,
155
- "learning_rate": 2.9969180891255026e-05,
156
- "loss": 0.7537,
157
- "step": 200
158
- },
159
- {
160
- "epoch": 0.7342657342657343,
161
- "grad_norm": 14.994546890258789,
162
- "learning_rate": 2.996271156109529e-05,
163
- "loss": 0.53,
164
- "step": 210
165
- },
166
- {
167
- "epoch": 0.7692307692307693,
168
- "grad_norm": 16.125471115112305,
169
- "learning_rate": 2.995562717246821e-05,
170
- "loss": 0.7043,
171
- "step": 220
172
- },
173
- {
174
- "epoch": 0.8041958041958042,
175
- "grad_norm": 16.887327194213867,
176
- "learning_rate": 2.9947928016585252e-05,
177
- "loss": 0.6479,
178
- "step": 230
179
- },
180
- {
181
- "epoch": 0.8391608391608392,
182
- "grad_norm": 12.033827781677246,
183
- "learning_rate": 2.993961440992857e-05,
184
- "loss": 0.5412,
185
- "step": 240
186
- },
187
- {
188
- "epoch": 0.8741258741258742,
189
- "grad_norm": 12.485661506652832,
190
- "learning_rate": 2.9930686694237954e-05,
191
- "loss": 0.5907,
192
- "step": 250
193
- },
194
- {
195
- "epoch": 0.9090909090909091,
196
- "grad_norm": 18.46721076965332,
197
- "learning_rate": 2.992114523649684e-05,
198
- "loss": 0.8215,
199
- "step": 260
200
- },
201
- {
202
- "epoch": 0.9440559440559441,
203
- "grad_norm": 15.521458625793457,
204
- "learning_rate": 2.9910990428917184e-05,
205
- "loss": 0.5217,
206
- "step": 270
207
- },
208
- {
209
- "epoch": 0.9790209790209791,
210
- "grad_norm": 23.15575408935547,
211
- "learning_rate": 2.9900222688923354e-05,
212
- "loss": 0.7436,
213
- "step": 280
214
- },
215
- {
216
- "epoch": 1.013986013986014,
217
- "grad_norm": 11.568449974060059,
218
- "learning_rate": 2.9888842459134957e-05,
219
- "loss": 0.5377,
220
- "step": 290
221
- },
222
- {
223
- "epoch": 1.048951048951049,
224
- "grad_norm": 14.441850662231445,
225
- "learning_rate": 2.987685020734867e-05,
226
- "loss": 0.5759,
227
- "step": 300
228
- },
229
- {
230
- "epoch": 1.048951048951049,
231
- "eval_loss": 0.7050026059150696,
232
- "eval_runtime": 5.3871,
233
- "eval_samples_per_second": 11.88,
234
- "eval_steps_per_second": 11.88,
235
- "step": 300
236
- },
237
- {
238
- "epoch": 1.083916083916084,
239
- "grad_norm": 14.990555763244629,
240
- "learning_rate": 2.9864246426519002e-05,
241
- "loss": 0.5357,
242
- "step": 310
243
- },
244
- {
245
- "epoch": 1.118881118881119,
246
- "grad_norm": 12.700815200805664,
247
- "learning_rate": 2.985103163473801e-05,
248
- "loss": 0.4844,
249
- "step": 320
250
- },
251
- {
252
- "epoch": 1.1538461538461537,
253
- "grad_norm": 26.03995704650879,
254
- "learning_rate": 2.9837206375214023e-05,
255
- "loss": 0.6256,
256
- "step": 330
257
- },
258
- {
259
- "epoch": 1.1888111888111887,
260
- "grad_norm": 14.494388580322266,
261
- "learning_rate": 2.9822771216249316e-05,
262
- "loss": 0.5045,
263
- "step": 340
264
- },
265
- {
266
- "epoch": 1.2237762237762237,
267
- "grad_norm": 4.644023895263672,
268
- "learning_rate": 2.9807726751216736e-05,
269
- "loss": 0.5306,
270
- "step": 350
271
- },
272
- {
273
- "epoch": 1.2587412587412588,
274
- "grad_norm": 17.606008529663086,
275
- "learning_rate": 2.9792073598535302e-05,
276
- "loss": 0.6347,
277
- "step": 360
278
- },
279
- {
280
- "epoch": 1.2937062937062938,
281
- "grad_norm": 18.099464416503906,
282
- "learning_rate": 2.9775812401644833e-05,
283
- "loss": 0.5037,
284
- "step": 370
285
- },
286
- {
287
- "epoch": 1.3286713286713288,
288
- "grad_norm": 17.8880558013916,
289
- "learning_rate": 2.9758943828979424e-05,
290
- "loss": 0.5738,
291
- "step": 380
292
- },
293
- {
294
- "epoch": 1.3636363636363638,
295
- "grad_norm": 14.929105758666992,
296
- "learning_rate": 2.9741468573940037e-05,
297
- "loss": 0.7208,
298
- "step": 390
299
- },
300
- {
301
- "epoch": 1.3986013986013985,
302
- "grad_norm": 21.71738624572754,
303
- "learning_rate": 2.972338735486596e-05,
304
- "loss": 0.5441,
305
- "step": 400
306
- },
307
- {
308
- "epoch": 1.4335664335664335,
309
- "grad_norm": 15.354290008544922,
310
- "learning_rate": 2.970470091500529e-05,
311
- "loss": 0.4544,
312
- "step": 410
313
- },
314
- {
315
- "epoch": 1.4685314685314685,
316
- "grad_norm": 19.556142807006836,
317
- "learning_rate": 2.9685410022484374e-05,
318
- "loss": 0.5871,
319
- "step": 420
320
- },
321
- {
322
- "epoch": 1.5034965034965035,
323
- "grad_norm": 11.594173431396484,
324
- "learning_rate": 2.9665515470276253e-05,
325
- "loss": 0.6366,
326
- "step": 430
327
- },
328
- {
329
- "epoch": 1.5384615384615383,
330
- "grad_norm": 14.30229663848877,
331
- "learning_rate": 2.9645018076168043e-05,
332
- "loss": 0.6079,
333
- "step": 440
334
- },
335
- {
336
- "epoch": 1.5734265734265733,
337
- "grad_norm": 17.306808471679688,
338
- "learning_rate": 2.9623918682727335e-05,
339
- "loss": 0.4741,
340
- "step": 450
341
- },
342
- {
343
- "epoch": 1.5734265734265733,
344
- "eval_loss": 0.695993185043335,
345
- "eval_runtime": 5.7672,
346
- "eval_samples_per_second": 11.097,
347
- "eval_steps_per_second": 11.097,
348
- "step": 450
349
- },
350
- {
351
- "epoch": 1.6083916083916083,
352
- "grad_norm": 23.68806266784668,
353
- "learning_rate": 2.9602218157267552e-05,
354
- "loss": 0.5325,
355
- "step": 460
356
- },
357
- {
358
- "epoch": 1.6433566433566433,
359
- "grad_norm": 15.258186340332031,
360
- "learning_rate": 2.95799173918123e-05,
361
- "loss": 0.5714,
362
- "step": 470
363
- },
364
- {
365
- "epoch": 1.6783216783216783,
366
- "grad_norm": 10.936037063598633,
367
- "learning_rate": 2.9557017303058703e-05,
368
- "loss": 0.5293,
369
- "step": 480
370
- },
371
- {
372
- "epoch": 1.7132867132867133,
373
- "grad_norm": 18.107650756835938,
374
- "learning_rate": 2.953351883233971e-05,
375
- "loss": 0.5159,
376
- "step": 490
377
- },
378
- {
379
- "epoch": 1.7482517482517483,
380
- "grad_norm": 23.29376792907715,
381
- "learning_rate": 2.9509422945585423e-05,
382
- "loss": 0.6026,
383
- "step": 500
384
- },
385
- {
386
- "epoch": 1.7832167832167833,
387
- "grad_norm": 12.05435562133789,
388
- "learning_rate": 2.9484730633283364e-05,
389
- "loss": 0.4782,
390
- "step": 510
391
- },
392
- {
393
- "epoch": 1.8181818181818183,
394
- "grad_norm": 21.805150985717773,
395
- "learning_rate": 2.945944291043778e-05,
396
- "loss": 0.6525,
397
- "step": 520
398
- },
399
- {
400
- "epoch": 1.8531468531468531,
401
- "grad_norm": 11.176011085510254,
402
- "learning_rate": 2.9433560816527912e-05,
403
- "loss": 0.6669,
404
- "step": 530
405
- },
406
- {
407
- "epoch": 1.8881118881118881,
408
- "grad_norm": 13.804393768310547,
409
- "learning_rate": 2.940708541546527e-05,
410
- "loss": 0.6876,
411
- "step": 540
412
- },
413
- {
414
- "epoch": 1.9230769230769231,
415
- "grad_norm": 13.416976928710938,
416
- "learning_rate": 2.9380017795549888e-05,
417
- "loss": 0.4839,
418
- "step": 550
419
- },
420
- {
421
- "epoch": 1.958041958041958,
422
- "grad_norm": 11.667569160461426,
423
- "learning_rate": 2.935235906942561e-05,
424
- "loss": 0.5205,
425
- "step": 560
426
- },
427
- {
428
- "epoch": 1.993006993006993,
429
- "grad_norm": 30.873056411743164,
430
- "learning_rate": 2.9324110374034337e-05,
431
- "loss": 0.5271,
432
- "step": 570
433
- },
434
- {
435
- "epoch": 2.027972027972028,
436
- "grad_norm": 13.170394897460938,
437
- "learning_rate": 2.929527287056929e-05,
438
- "loss": 0.3298,
439
- "step": 580
440
- },
441
- {
442
- "epoch": 2.062937062937063,
443
- "grad_norm": 9.41249942779541,
444
- "learning_rate": 2.9265847744427285e-05,
445
- "loss": 0.4833,
446
- "step": 590
447
- },
448
- {
449
- "epoch": 2.097902097902098,
450
- "grad_norm": 20.784318923950195,
451
- "learning_rate": 2.923583620516001e-05,
452
- "loss": 0.5354,
453
- "step": 600
454
- },
455
- {
456
- "epoch": 2.097902097902098,
457
- "eval_loss": 0.6886742115020752,
458
- "eval_runtime": 5.7912,
459
- "eval_samples_per_second": 11.051,
460
- "eval_steps_per_second": 11.051,
461
- "step": 600
462
- },
463
- {
464
- "epoch": 2.132867132867133,
465
- "grad_norm": 16.683345794677734,
466
- "learning_rate": 2.92052394864243e-05,
467
- "loss": 0.4066,
468
- "step": 610
469
- },
470
- {
471
- "epoch": 2.167832167832168,
472
- "grad_norm": 11.515765190124512,
473
- "learning_rate": 2.917405884593142e-05,
474
- "loss": 0.3847,
475
- "step": 620
476
- },
477
- {
478
- "epoch": 2.202797202797203,
479
- "grad_norm": 16.18233299255371,
480
- "learning_rate": 2.9142295565395373e-05,
481
- "loss": 0.4465,
482
- "step": 630
483
- },
484
- {
485
- "epoch": 2.237762237762238,
486
- "grad_norm": 12.035266876220703,
487
- "learning_rate": 2.910995095048022e-05,
488
- "loss": 0.461,
489
- "step": 640
490
- },
491
- {
492
- "epoch": 2.2727272727272725,
493
- "grad_norm": 10.977869033813477,
494
- "learning_rate": 2.9077026330746387e-05,
495
- "loss": 0.465,
496
- "step": 650
497
- },
498
- {
499
- "epoch": 2.3076923076923075,
500
- "grad_norm": 20.465116500854492,
501
- "learning_rate": 2.904352305959604e-05,
502
- "loss": 0.4204,
503
- "step": 660
504
- },
505
- {
506
- "epoch": 2.3426573426573425,
507
- "grad_norm": 12.585354804992676,
508
- "learning_rate": 2.900944251421743e-05,
509
- "loss": 0.4709,
510
- "step": 670
511
- },
512
- {
513
- "epoch": 2.3776223776223775,
514
- "grad_norm": 23.60915756225586,
515
- "learning_rate": 2.8974786095528292e-05,
516
- "loss": 0.4938,
517
- "step": 680
518
- },
519
- {
520
- "epoch": 2.4125874125874125,
521
- "grad_norm": 18.205228805541992,
522
- "learning_rate": 2.893955522811826e-05,
523
- "loss": 0.432,
524
- "step": 690
525
- },
526
- {
527
- "epoch": 2.4475524475524475,
528
- "grad_norm": 11.574105262756348,
529
- "learning_rate": 2.8903751360190307e-05,
530
- "loss": 0.4723,
531
- "step": 700
532
- },
533
- {
534
- "epoch": 2.4825174825174825,
535
- "grad_norm": 17.412702560424805,
536
- "learning_rate": 2.8867375963501203e-05,
537
- "loss": 0.4697,
538
- "step": 710
539
- },
540
- {
541
- "epoch": 2.5174825174825175,
542
- "grad_norm": 15.148882865905762,
543
- "learning_rate": 2.883043053330104e-05,
544
- "loss": 0.3637,
545
- "step": 720
546
- },
547
- {
548
- "epoch": 2.5524475524475525,
549
- "grad_norm": 13.45967960357666,
550
- "learning_rate": 2.8792916588271745e-05,
551
- "loss": 0.3917,
552
- "step": 730
553
- },
554
- {
555
- "epoch": 2.5874125874125875,
556
- "grad_norm": 22.990352630615234,
557
- "learning_rate": 2.875483567046466e-05,
558
- "loss": 0.4405,
559
- "step": 740
560
- },
561
- {
562
- "epoch": 2.6223776223776225,
563
- "grad_norm": 14.59798526763916,
564
- "learning_rate": 2.871618934523717e-05,
565
- "loss": 0.5114,
566
- "step": 750
567
- },
568
- {
569
- "epoch": 2.6223776223776225,
570
- "eval_loss": 0.6792302131652832,
571
- "eval_runtime": 5.3969,
572
- "eval_samples_per_second": 11.859,
573
- "eval_steps_per_second": 11.859,
574
- "step": 750
575
- },
576
- {
577
- "epoch": 2.6573426573426575,
578
- "grad_norm": 23.738746643066406,
579
- "learning_rate": 2.8676979201188336e-05,
580
- "loss": 0.5102,
581
- "step": 760
582
- },
583
- {
584
- "epoch": 2.6923076923076925,
585
- "grad_norm": 14.146129608154297,
586
- "learning_rate": 2.8637206850093607e-05,
587
- "loss": 0.4697,
588
- "step": 770
589
- },
590
- {
591
- "epoch": 2.7272727272727275,
592
- "grad_norm": 13.48592758178711,
593
- "learning_rate": 2.8596873926838552e-05,
594
- "loss": 0.5079,
595
- "step": 780
596
- },
597
- {
598
- "epoch": 2.762237762237762,
599
- "grad_norm": 15.442709922790527,
600
- "learning_rate": 2.8555982089351683e-05,
601
- "loss": 0.5034,
602
- "step": 790
603
- },
604
- {
605
- "epoch": 2.797202797202797,
606
- "grad_norm": 17.767261505126953,
607
- "learning_rate": 2.8514533018536265e-05,
608
- "loss": 0.3573,
609
- "step": 800
610
- },
611
- {
612
- "epoch": 2.832167832167832,
613
- "grad_norm": 15.318553924560547,
614
- "learning_rate": 2.8472528418201266e-05,
615
- "loss": 0.5365,
616
- "step": 810
617
- },
618
- {
619
- "epoch": 2.867132867132867,
620
- "grad_norm": 9.855380058288574,
621
- "learning_rate": 2.842997001499128e-05,
622
- "loss": 0.527,
623
- "step": 820
624
- },
625
- {
626
- "epoch": 2.902097902097902,
627
- "grad_norm": 9.013484954833984,
628
- "learning_rate": 2.838685955831557e-05,
629
- "loss": 0.3378,
630
- "step": 830
631
- },
632
- {
633
- "epoch": 2.937062937062937,
634
- "grad_norm": 36.93981170654297,
635
- "learning_rate": 2.834319882027616e-05,
636
- "loss": 0.7142,
637
- "step": 840
638
- },
639
- {
640
- "epoch": 2.972027972027972,
641
- "grad_norm": 10.132364273071289,
642
- "learning_rate": 2.8298989595594986e-05,
643
- "loss": 0.553,
644
- "step": 850
645
- },
646
- {
647
- "epoch": 3.006993006993007,
648
- "grad_norm": 6.798361778259277,
649
- "learning_rate": 2.8254233701540112e-05,
650
- "loss": 0.374,
651
- "step": 860
652
- },
653
- {
654
- "epoch": 3.041958041958042,
655
- "grad_norm": 20.141143798828125,
656
- "learning_rate": 2.820893297785105e-05,
657
- "loss": 0.3313,
658
- "step": 870
659
- },
660
- {
661
- "epoch": 3.076923076923077,
662
- "grad_norm": 11.781057357788086,
663
- "learning_rate": 2.8163089286663127e-05,
664
- "loss": 0.415,
665
- "step": 880
666
- },
667
- {
668
- "epoch": 3.111888111888112,
669
- "grad_norm": 6.8686604499816895,
670
- "learning_rate": 2.8116704512430917e-05,
671
- "loss": 0.3106,
672
- "step": 890
673
- },
674
- {
675
- "epoch": 3.1468531468531467,
676
- "grad_norm": 6.8011674880981445,
677
- "learning_rate": 2.8069780561850824e-05,
678
- "loss": 0.4021,
679
- "step": 900
680
- },
681
- {
682
- "epoch": 3.1468531468531467,
683
- "eval_loss": 0.6804107427597046,
684
- "eval_runtime": 5.482,
685
- "eval_samples_per_second": 11.675,
686
- "eval_steps_per_second": 11.675,
687
- "step": 900
688
- },
689
- {
690
- "epoch": 3.1818181818181817,
691
- "grad_norm": 10.609955787658691,
692
- "learning_rate": 2.802231936378266e-05,
693
- "loss": 0.4309,
694
- "step": 910
695
- },
696
- {
697
- "epoch": 3.2167832167832167,
698
- "grad_norm": 12.568284034729004,
699
- "learning_rate": 2.7974322869170385e-05,
700
- "loss": 0.5405,
701
- "step": 920
702
- },
703
- {
704
- "epoch": 3.2517482517482517,
705
- "grad_norm": 10.24028205871582,
706
- "learning_rate": 2.7925793050961907e-05,
707
- "loss": 0.4015,
708
- "step": 930
709
- },
710
- {
711
- "epoch": 3.2867132867132867,
712
- "grad_norm": 26.025440216064453,
713
- "learning_rate": 2.7876731904027973e-05,
714
- "loss": 0.4744,
715
- "step": 940
716
- },
717
- {
718
- "epoch": 3.3216783216783217,
719
- "grad_norm": 5.4463582038879395,
720
- "learning_rate": 2.7827141445080176e-05,
721
- "loss": 0.3603,
722
- "step": 950
723
- },
724
- {
725
- "epoch": 3.3566433566433567,
726
- "grad_norm": 17.8033504486084,
727
- "learning_rate": 2.7777023712588047e-05,
728
- "loss": 0.3492,
729
- "step": 960
730
- },
731
- {
732
- "epoch": 3.3916083916083917,
733
- "grad_norm": 14.276956558227539,
734
- "learning_rate": 2.772638076669527e-05,
735
- "loss": 0.4004,
736
- "step": 970
737
- },
738
- {
739
- "epoch": 3.4265734265734267,
740
- "grad_norm": 13.117759704589844,
741
- "learning_rate": 2.7675214689135002e-05,
742
- "loss": 0.4456,
743
- "step": 980
744
- },
745
- {
746
- "epoch": 3.4615384615384617,
747
- "grad_norm": 15.448287010192871,
748
- "learning_rate": 2.762352758314428e-05,
749
- "loss": 0.3232,
750
- "step": 990
751
- },
752
- {
753
- "epoch": 3.4965034965034967,
754
- "grad_norm": 13.702083587646484,
755
- "learning_rate": 2.757132157337759e-05,
756
- "loss": 0.4483,
757
- "step": 1000
758
- },
759
- {
760
- "epoch": 3.5314685314685317,
761
- "grad_norm": 11.251469612121582,
762
- "learning_rate": 2.751859880581952e-05,
763
- "loss": 0.3263,
764
- "step": 1010
765
- },
766
- {
767
- "epoch": 3.5664335664335667,
768
- "grad_norm": 17.5662841796875,
769
- "learning_rate": 2.746536144769654e-05,
770
- "loss": 0.4312,
771
- "step": 1020
772
- },
773
- {
774
- "epoch": 3.6013986013986012,
775
- "grad_norm": 15.075780868530273,
776
- "learning_rate": 2.741161168738793e-05,
777
- "loss": 0.4887,
778
- "step": 1030
779
- },
780
- {
781
- "epoch": 3.6363636363636362,
782
- "grad_norm": 13.685576438903809,
783
- "learning_rate": 2.7357351734335802e-05,
784
- "loss": 0.2859,
785
- "step": 1040
786
- },
787
- {
788
- "epoch": 3.6713286713286712,
789
- "grad_norm": 8.514018058776855,
790
- "learning_rate": 2.730258381895432e-05,
791
- "loss": 0.2317,
792
- "step": 1050
793
- },
794
- {
795
- "epoch": 3.6713286713286712,
796
- "eval_loss": 0.682501494884491,
797
- "eval_runtime": 5.4306,
798
- "eval_samples_per_second": 11.785,
799
- "eval_steps_per_second": 11.785,
800
- "step": 1050
801
- },
802
- {
803
- "epoch": 3.7062937062937062,
804
- "grad_norm": 12.129782676696777,
805
- "learning_rate": 1.365129190947716e-06,
806
- "loss": 0.447,
807
- "step": 1060
808
- },
809
- {
810
- "epoch": 3.7412587412587412,
811
- "grad_norm": 16.233694076538086,
812
- "learning_rate": 2.730258381895432e-06,
813
- "loss": 0.3503,
814
- "step": 1070
815
- },
816
- {
817
- "epoch": 3.7762237762237763,
818
- "grad_norm": 20.50456428527832,
819
- "learning_rate": 4.095387572843148e-06,
820
- "loss": 0.3355,
821
- "step": 1080
822
- },
823
- {
824
- "epoch": 3.8111888111888113,
825
- "grad_norm": 5.729248523712158,
826
- "learning_rate": 5.460516763790864e-06,
827
- "loss": 0.4045,
828
- "step": 1090
829
- },
830
- {
831
- "epoch": 3.8461538461538463,
832
- "grad_norm": 20.16579818725586,
833
- "learning_rate": 6.82564595473858e-06,
834
- "loss": 0.4228,
835
- "step": 1100
836
- },
837
- {
838
- "epoch": 3.8811188811188813,
839
- "grad_norm": 8.82602596282959,
840
- "learning_rate": 8.190775145686295e-06,
841
- "loss": 0.3125,
842
- "step": 1110
843
- },
844
- {
845
- "epoch": 3.916083916083916,
846
- "grad_norm": 12.37415885925293,
847
- "learning_rate": 9.555904336634011e-06,
848
- "loss": 0.4757,
849
- "step": 1120
850
- },
851
- {
852
- "epoch": 3.951048951048951,
853
- "grad_norm": 9.278702735900879,
854
- "learning_rate": 1.0921033527581728e-05,
855
- "loss": 0.2989,
856
- "step": 1130
857
- },
858
- {
859
- "epoch": 3.986013986013986,
860
- "grad_norm": 9.90930461883545,
861
- "learning_rate": 1.2286162718529444e-05,
862
- "loss": 0.3974,
863
- "step": 1140
864
- },
865
- {
866
- "epoch": 4.020979020979021,
867
- "grad_norm": 10.056645393371582,
868
- "learning_rate": 1.365129190947716e-05,
869
- "loss": 0.3608,
870
- "step": 1150
871
- },
872
- {
873
- "epoch": 4.055944055944056,
874
- "grad_norm": 7.33254337310791,
875
- "learning_rate": 1.3651151621691673e-05,
876
- "loss": 0.339,
877
- "step": 1160
878
- },
879
- {
880
- "epoch": 4.090909090909091,
881
- "grad_norm": 24.452014923095703,
882
- "learning_rate": 1.3650730764101896e-05,
883
- "loss": 0.3583,
884
- "step": 1170
885
- },
886
- {
887
- "epoch": 4.125874125874126,
888
- "grad_norm": 7.368541240692139,
889
- "learning_rate": 1.3650029354007634e-05,
890
- "loss": 0.2664,
891
- "step": 1180
892
- },
893
- {
894
- "epoch": 4.160839160839161,
895
- "grad_norm": 22.197839736938477,
896
- "learning_rate": 1.364904742024111e-05,
897
- "loss": 0.4509,
898
- "step": 1190
899
- },
900
- {
901
- "epoch": 4.195804195804196,
902
- "grad_norm": 9.575596809387207,
903
- "learning_rate": 1.3647785003165774e-05,
904
- "loss": 0.3067,
905
- "step": 1200
906
- },
907
- {
908
- "epoch": 4.195804195804196,
909
- "eval_loss": 0.6809002757072449,
910
- "eval_runtime": 5.4517,
911
- "eval_samples_per_second": 11.74,
912
- "eval_steps_per_second": 11.74,
913
- "step": 1200
914
- },
915
- {
916
- "epoch": 4.230769230769231,
917
- "grad_norm": 10.897104263305664,
918
- "learning_rate": 1.364624215467465e-05,
919
- "loss": 0.4448,
920
- "step": 1210
921
- },
922
- {
923
- "epoch": 4.265734265734266,
924
- "grad_norm": 10.724977493286133,
925
- "learning_rate": 1.3644418938188194e-05,
926
- "loss": 0.2332,
927
- "step": 1220
928
- },
929
- {
930
- "epoch": 4.300699300699301,
931
- "grad_norm": 19.249921798706055,
932
- "learning_rate": 1.3642315428651695e-05,
933
- "loss": 0.314,
934
- "step": 1230
935
- },
936
- {
937
- "epoch": 4.335664335664336,
938
- "grad_norm": 5.301750183105469,
939
- "learning_rate": 1.363993171253219e-05,
940
- "loss": 0.3981,
941
- "step": 1240
942
- },
943
- {
944
- "epoch": 4.370629370629371,
945
- "grad_norm": 7.834229946136475,
946
- "learning_rate": 1.3637267887814916e-05,
947
- "loss": 0.3859,
948
- "step": 1250
949
- },
950
- {
951
- "epoch": 4.405594405594406,
952
- "grad_norm": 9.660653114318848,
953
- "learning_rate": 1.3634324063999272e-05,
954
- "loss": 0.3121,
955
- "step": 1260
956
- },
957
- {
958
- "epoch": 4.440559440559441,
959
- "grad_norm": 10.212327003479004,
960
- "learning_rate": 1.3631100362094322e-05,
961
- "loss": 0.3114,
962
- "step": 1270
963
- },
964
- {
965
- "epoch": 4.475524475524476,
966
- "grad_norm": 7.464266300201416,
967
- "learning_rate": 1.362759691461383e-05,
968
- "loss": 0.297,
969
- "step": 1280
970
- },
971
- {
972
- "epoch": 4.510489510489511,
973
- "grad_norm": 10.588373184204102,
974
- "learning_rate": 1.3623813865570797e-05,
975
- "loss": 0.2884,
976
- "step": 1290
977
- },
978
- {
979
- "epoch": 4.545454545454545,
980
- "grad_norm": 5.730044364929199,
981
- "learning_rate": 1.3619751370471551e-05,
982
- "loss": 0.3567,
983
- "step": 1300
984
- },
985
- {
986
- "epoch": 4.58041958041958,
987
- "grad_norm": 19.52901268005371,
988
- "learning_rate": 1.3615409596309355e-05,
989
- "loss": 0.2881,
990
- "step": 1310
991
- },
992
- {
993
- "epoch": 4.615384615384615,
994
- "grad_norm": 3.7794055938720703,
995
- "learning_rate": 1.361078872155754e-05,
996
- "loss": 0.3635,
997
- "step": 1320
998
- },
999
- {
1000
- "epoch": 4.65034965034965,
1001
- "grad_norm": 4.951632499694824,
1002
- "learning_rate": 1.3605888936162167e-05,
1003
- "loss": 0.3109,
1004
- "step": 1330
1005
- },
1006
- {
1007
- "epoch": 4.685314685314685,
1008
- "grad_norm": 10.017960548400879,
1009
- "learning_rate": 1.3600710441534224e-05,
1010
- "loss": 0.2436,
1011
- "step": 1340
1012
- },
1013
- {
1014
- "epoch": 4.72027972027972,
1015
- "grad_norm": 16.66628646850586,
1016
- "learning_rate": 1.359525345054134e-05,
1017
- "loss": 0.3864,
1018
- "step": 1350
1019
- },
1020
- {
1021
- "epoch": 4.72027972027972,
1022
- "eval_loss": 0.6838909983634949,
1023
- "eval_runtime": 5.4678,
1024
- "eval_samples_per_second": 11.705,
1025
- "eval_steps_per_second": 11.705,
1026
- "step": 1350
1027
- },
1028
- {
1029
- "epoch": 4.755244755244755,
1030
- "grad_norm": 9.737703323364258,
1031
- "learning_rate": 1.3589518187499044e-05,
1032
- "loss": 0.3225,
1033
- "step": 1360
1034
- },
1035
- {
1036
- "epoch": 4.79020979020979,
1037
- "grad_norm": 15.640142440795898,
1038
- "learning_rate": 1.3583504888161534e-05,
1039
- "loss": 0.2717,
1040
- "step": 1370
1041
- },
1042
- {
1043
- "epoch": 4.825174825174825,
1044
- "grad_norm": 10.82050895690918,
1045
- "learning_rate": 1.3577213799711993e-05,
1046
- "loss": 0.3008,
1047
- "step": 1380
1048
- },
1049
- {
1050
- "epoch": 4.86013986013986,
1051
- "grad_norm": 15.912845611572266,
1052
- "learning_rate": 1.3570645180752428e-05,
1053
- "loss": 0.2563,
1054
- "step": 1390
1055
- },
1056
- {
1057
- "epoch": 4.895104895104895,
1058
- "grad_norm": 7.422889232635498,
1059
- "learning_rate": 1.3563799301293039e-05,
1060
- "loss": 0.3438,
1061
- "step": 1400
1062
- },
1063
- {
1064
- "epoch": 4.93006993006993,
1065
- "grad_norm": 10.84676742553711,
1066
- "learning_rate": 1.355667644274111e-05,
1067
- "loss": 0.2496,
1068
- "step": 1410
1069
- },
1070
- {
1071
- "epoch": 4.965034965034965,
1072
- "grad_norm": 11.794937133789062,
1073
- "learning_rate": 1.3549276897889468e-05,
1074
- "loss": 0.4524,
1075
- "step": 1420
1076
- },
1077
- {
1078
- "epoch": 5.0,
1079
- "grad_norm": 10.1076078414917,
1080
- "learning_rate": 1.3541600970904412e-05,
1081
- "loss": 0.3511,
1082
- "step": 1430
1083
- },
1084
- {
1085
- "epoch": 5.034965034965035,
1086
- "grad_norm": 6.803938865661621,
1087
- "learning_rate": 1.3533648977313237e-05,
1088
- "loss": 0.2687,
1089
- "step": 1440
1090
- },
1091
- {
1092
- "epoch": 5.06993006993007,
1093
- "grad_norm": 7.840601444244385,
1094
- "learning_rate": 1.3525421243991256e-05,
1095
- "loss": 0.373,
1096
- "step": 1450
1097
- },
1098
- {
1099
- "epoch": 5.104895104895105,
1100
- "grad_norm": 22.30022621154785,
1101
- "learning_rate": 1.3516918109148357e-05,
1102
- "loss": 0.3071,
1103
- "step": 1460
1104
- },
1105
- {
1106
- "epoch": 5.13986013986014,
1107
- "grad_norm": 17.0526180267334,
1108
- "learning_rate": 1.3508139922315113e-05,
1109
- "loss": 0.3202,
1110
- "step": 1470
1111
- },
1112
- {
1113
- "epoch": 5.174825174825175,
1114
- "grad_norm": 6.399594306945801,
1115
- "learning_rate": 1.34990870443284e-05,
1116
- "loss": 0.1617,
1117
- "step": 1480
1118
- },
1119
- {
1120
- "epoch": 5.20979020979021,
1121
- "grad_norm": 13.202977180480957,
1122
- "learning_rate": 1.3489759847316573e-05,
1123
- "loss": 0.3732,
1124
- "step": 1490
1125
- },
1126
- {
1127
- "epoch": 5.244755244755245,
1128
- "grad_norm": 17.730331420898438,
1129
- "learning_rate": 1.3480158714684173e-05,
1130
- "loss": 0.3192,
1131
- "step": 1500
1132
- },
1133
- {
1134
- "epoch": 5.244755244755245,
1135
- "eval_loss": 0.6989673972129822,
1136
- "eval_runtime": 5.5352,
1137
- "eval_samples_per_second": 11.562,
1138
- "eval_steps_per_second": 11.562,
1139
- "step": 1500
1140
- },
1141
- {
1142
- "epoch": 5.27972027972028,
1143
- "grad_norm": 52.83620834350586,
1144
- "learning_rate": 6.740079357342087e-07,
1145
- "loss": 0.4347,
1146
- "step": 1510
1147
- },
1148
- {
1149
- "epoch": 5.314685314685315,
1150
- "grad_norm": 13.815258979797363,
1151
- "learning_rate": 1.3480158714684174e-06,
1152
- "loss": 0.2398,
1153
- "step": 1520
1154
- },
1155
- {
1156
- "epoch": 5.34965034965035,
1157
- "grad_norm": 9.805350303649902,
1158
- "learning_rate": 2.022023807202626e-06,
1159
- "loss": 0.3022,
1160
- "step": 1530
1161
- },
1162
- {
1163
- "epoch": 5.384615384615385,
1164
- "grad_norm": 12.087991714477539,
1165
- "learning_rate": 2.696031742936835e-06,
1166
- "loss": 0.2382,
1167
- "step": 1540
1168
- },
1169
- {
1170
- "epoch": 5.41958041958042,
1171
- "grad_norm": 8.947698593139648,
1172
- "learning_rate": 3.3700396786710433e-06,
1173
- "loss": 0.2632,
1174
- "step": 1550
1175
- },
1176
- {
1177
- "epoch": 5.454545454545454,
1178
- "grad_norm": 13.196881294250488,
1179
- "learning_rate": 4.044047614405252e-06,
1180
- "loss": 0.3315,
1181
- "step": 1560
1182
- },
1183
- {
1184
- "epoch": 5.489510489510489,
1185
- "grad_norm": 9.41542911529541,
1186
- "learning_rate": 4.71805555013946e-06,
1187
- "loss": 0.2862,
1188
- "step": 1570
1189
- },
1190
- {
1191
- "epoch": 5.524475524475524,
1192
- "grad_norm": 13.778592109680176,
1193
- "learning_rate": 5.39206348587367e-06,
1194
- "loss": 0.2527,
1195
- "step": 1580
1196
- },
1197
- {
1198
- "epoch": 5.559440559440559,
1199
- "grad_norm": 5.95212984085083,
1200
- "learning_rate": 6.066071421607878e-06,
1201
- "loss": 0.3325,
1202
- "step": 1590
1203
- },
1204
- {
1205
- "epoch": 5.594405594405594,
1206
- "grad_norm": 14.649141311645508,
1207
- "learning_rate": 6.7400793573420866e-06,
1208
- "loss": 0.3249,
1209
- "step": 1600
1210
- },
1211
- {
1212
- "epoch": 5.629370629370629,
1213
- "grad_norm": 14.40283489227295,
1214
- "learning_rate": 6.740010092776263e-06,
1215
- "loss": 0.3323,
1216
- "step": 1610
1217
- },
1218
- {
1219
- "epoch": 5.664335664335664,
1220
- "grad_norm": 24.079910278320312,
1221
- "learning_rate": 6.739802301925987e-06,
1222
- "loss": 0.3388,
1223
- "step": 1620
1224
- },
1225
- {
1226
- "epoch": 5.699300699300699,
1227
- "grad_norm": 8.163803100585938,
1228
- "learning_rate": 6.739455993332726e-06,
1229
- "loss": 0.2671,
1230
- "step": 1630
1231
- },
1232
- {
1233
- "epoch": 5.734265734265734,
1234
- "grad_norm": 6.428587913513184,
1235
- "learning_rate": 6.73897118123187e-06,
1236
- "loss": 0.2697,
1237
- "step": 1640
1238
- },
1239
- {
1240
- "epoch": 5.769230769230769,
1241
- "grad_norm": 13.600237846374512,
1242
- "learning_rate": 6.738347885552146e-06,
1243
- "loss": 0.2119,
1244
- "step": 1650
1245
- },
1246
- {
1247
- "epoch": 5.769230769230769,
1248
- "eval_loss": 0.6929482817649841,
1249
- "eval_runtime": 5.408,
1250
- "eval_samples_per_second": 11.834,
1251
- "eval_steps_per_second": 11.834,
1252
- "step": 1650
1253
- },
1254
- {
1255
- "epoch": 5.804195804195804,
1256
- "grad_norm": 15.782163619995117,
1257
- "learning_rate": 6.737586131914798e-06,
1258
- "loss": 0.2109,
1259
- "step": 1660
1260
- },
1261
- {
1262
- "epoch": 5.839160839160839,
1263
- "grad_norm": 5.466376781463623,
1264
- "learning_rate": 6.736685951632536e-06,
1265
- "loss": 0.2627,
1266
- "step": 1670
1267
- },
1268
- {
1269
- "epoch": 5.874125874125874,
1270
- "grad_norm": 20.662796020507812,
1271
- "learning_rate": 6.7356473817082425e-06,
1272
- "loss": 0.3293,
1273
- "step": 1680
1274
- },
1275
- {
1276
- "epoch": 5.909090909090909,
1277
- "grad_norm": 10.718391418457031,
1278
- "learning_rate": 6.734470464833461e-06,
1279
- "loss": 0.2461,
1280
- "step": 1690
1281
- },
1282
- {
1283
- "epoch": 5.944055944055944,
1284
- "grad_norm": 20.616153717041016,
1285
- "learning_rate": 6.733155249386635e-06,
1286
- "loss": 0.2833,
1287
- "step": 1700
1288
- },
1289
- {
1290
- "epoch": 5.979020979020979,
1291
- "grad_norm": 11.193328857421875,
1292
- "learning_rate": 6.731701789431119e-06,
1293
- "loss": 0.3295,
1294
- "step": 1710
1295
- },
1296
- {
1297
- "epoch": 6.013986013986014,
1298
- "grad_norm": 17.947956085205078,
1299
- "learning_rate": 6.73011014471296e-06,
1300
- "loss": 0.3172,
1301
- "step": 1720
1302
- },
1303
- {
1304
- "epoch": 6.048951048951049,
1305
- "grad_norm": 22.818161010742188,
1306
- "learning_rate": 6.728380380658438e-06,
1307
- "loss": 0.3279,
1308
- "step": 1730
1309
- },
1310
- {
1311
- "epoch": 6.083916083916084,
1312
- "grad_norm": 7.237072944641113,
1313
- "learning_rate": 6.726512568371378e-06,
1314
- "loss": 0.2883,
1315
- "step": 1740
1316
- },
1317
- {
1318
- "epoch": 6.118881118881119,
1319
- "grad_norm": 11.877559661865234,
1320
- "learning_rate": 6.724506784630227e-06,
1321
- "loss": 0.308,
1322
- "step": 1750
1323
- },
1324
- {
1325
- "epoch": 6.153846153846154,
1326
- "grad_norm": 7.487149715423584,
1327
- "learning_rate": 6.7223631118849e-06,
1328
- "loss": 0.2349,
1329
- "step": 1760
1330
- },
1331
- {
1332
- "epoch": 6.188811188811189,
1333
- "grad_norm": 8.094034194946289,
1334
- "learning_rate": 6.720081638253386e-06,
1335
- "loss": 0.3345,
1336
- "step": 1770
1337
- },
1338
- {
1339
- "epoch": 6.223776223776224,
1340
- "grad_norm": 9.092995643615723,
1341
- "learning_rate": 6.717662457518131e-06,
1342
- "loss": 0.2765,
1343
- "step": 1780
1344
- },
1345
- {
1346
- "epoch": 6.258741258741258,
1347
- "grad_norm": 12.7426176071167,
1348
- "learning_rate": 6.715105669122178e-06,
1349
- "loss": 0.219,
1350
- "step": 1790
1351
- },
1352
- {
1353
- "epoch": 6.293706293706293,
1354
- "grad_norm": 12.268411636352539,
1355
- "learning_rate": 6.712411378165085e-06,
1356
- "loss": 0.245,
1357
- "step": 1800
1358
- },
1359
- {
1360
- "epoch": 6.293706293706293,
1361
- "eval_loss": 0.6965098977088928,
1362
- "eval_runtime": 5.433,
1363
- "eval_samples_per_second": 11.78,
1364
- "eval_steps_per_second": 11.78,
1365
- "step": 1800
1366
- },
1367
- {
1368
- "epoch": 6.328671328671328,
1369
- "grad_norm": 7.888387680053711,
1370
- "learning_rate": 6.709579695398601e-06,
1371
- "loss": 0.2562,
1372
- "step": 1810
1373
- },
1374
- {
1375
- "epoch": 6.363636363636363,
1376
- "grad_norm": 7.875286102294922,
1377
- "learning_rate": 6.706610737222113e-06,
1378
- "loss": 0.2944,
1379
- "step": 1820
1380
- },
1381
- {
1382
- "epoch": 6.398601398601398,
1383
- "grad_norm": 7.031346797943115,
1384
- "learning_rate": 6.703504625677862e-06,
1385
- "loss": 0.2429,
1386
- "step": 1830
1387
- },
1388
- {
1389
- "epoch": 6.433566433566433,
1390
- "grad_norm": 6.460575103759766,
1391
- "learning_rate": 6.70026148844593e-06,
1392
- "loss": 0.258,
1393
- "step": 1840
1394
- },
1395
- {
1396
- "epoch": 6.468531468531468,
1397
- "grad_norm": 14.718223571777344,
1398
- "learning_rate": 6.696881458838985e-06,
1399
- "loss": 0.3136,
1400
- "step": 1850
1401
- },
1402
- {
1403
- "epoch": 6.503496503496503,
1404
- "grad_norm": 9.787906646728516,
1405
- "learning_rate": 6.693364675796803e-06,
1406
- "loss": 0.2644,
1407
- "step": 1860
1408
- },
1409
- {
1410
- "epoch": 6.538461538461538,
1411
- "grad_norm": 15.591919898986816,
1412
- "learning_rate": 6.689711283880565e-06,
1413
- "loss": 0.322,
1414
- "step": 1870
1415
- },
1416
- {
1417
- "epoch": 6.573426573426573,
1418
- "grad_norm": 9.265682220458984,
1419
- "learning_rate": 6.685921433266901e-06,
1420
- "loss": 0.3452,
1421
- "step": 1880
1422
- },
1423
- {
1424
- "epoch": 6.608391608391608,
1425
- "grad_norm": 11.823332786560059,
1426
- "learning_rate": 6.681995279741726e-06,
1427
- "loss": 0.301,
1428
- "step": 1890
1429
- },
1430
- {
1431
- "epoch": 6.643356643356643,
1432
- "grad_norm": 40.379150390625,
1433
- "learning_rate": 6.677932984693833e-06,
1434
- "loss": 0.2843,
1435
- "step": 1900
1436
- },
1437
- {
1438
- "epoch": 6.678321678321678,
1439
- "grad_norm": 14.873003005981445,
1440
- "learning_rate": 6.673734715108263e-06,
1441
- "loss": 0.2179,
1442
- "step": 1910
1443
- },
1444
- {
1445
- "epoch": 6.713286713286713,
1446
- "grad_norm": 10.106959342956543,
1447
- "learning_rate": 6.669400643559431e-06,
1448
- "loss": 0.2102,
1449
- "step": 1920
1450
- },
1451
- {
1452
- "epoch": 6.748251748251748,
1453
- "grad_norm": 13.115974426269531,
1454
- "learning_rate": 6.664930948204048e-06,
1455
- "loss": 0.2349,
1456
- "step": 1930
1457
- },
1458
- {
1459
- "epoch": 6.783216783216783,
1460
- "grad_norm": 10.651406288146973,
1461
- "learning_rate": 6.660325812773779e-06,
1462
- "loss": 0.3099,
1463
- "step": 1940
1464
- },
1465
- {
1466
- "epoch": 6.818181818181818,
1467
- "grad_norm": 20.88200569152832,
1468
- "learning_rate": 6.655585426567707e-06,
1469
- "loss": 0.2866,
1470
- "step": 1950
1471
- },
1472
- {
1473
- "epoch": 6.818181818181818,
1474
- "eval_loss": 0.6974838972091675,
1475
- "eval_runtime": 5.4412,
1476
- "eval_samples_per_second": 11.762,
1477
- "eval_steps_per_second": 11.762,
1478
- "step": 1950
1479
- },
1480
- {
1481
- "epoch": 6.853146853146853,
1482
- "grad_norm": 11.113102912902832,
1483
- "learning_rate": 3.3277927132838535e-07,
1484
- "loss": 0.2691,
1485
- "step": 1960
1486
- },
1487
- {
1488
- "epoch": 6.888111888111888,
1489
- "grad_norm": 12.720255851745605,
1490
- "learning_rate": 6.655585426567707e-07,
1491
- "loss": 0.2406,
1492
- "step": 1970
1493
- },
1494
- {
1495
- "epoch": 6.923076923076923,
1496
- "grad_norm": 5.015578269958496,
1497
- "learning_rate": 9.98337813985156e-07,
1498
- "loss": 0.2339,
1499
- "step": 1980
1500
- },
1501
- {
1502
- "epoch": 6.958041958041958,
1503
- "grad_norm": 14.693328857421875,
1504
- "learning_rate": 1.3311170853135414e-06,
1505
- "loss": 0.2724,
1506
- "step": 1990
1507
- },
1508
- {
1509
- "epoch": 6.993006993006993,
1510
- "grad_norm": 14.348461151123047,
1511
- "learning_rate": 1.6638963566419267e-06,
1512
- "loss": 0.2537,
1513
- "step": 2000
1514
- },
1515
- {
1516
- "epoch": 7.027972027972028,
1517
- "grad_norm": 9.5806303024292,
1518
- "learning_rate": 1.996675627970312e-06,
1519
- "loss": 0.2226,
1520
- "step": 2010
1521
- },
1522
- {
1523
- "epoch": 7.062937062937063,
1524
- "grad_norm": 11.463421821594238,
1525
- "learning_rate": 2.329454899298697e-06,
1526
- "loss": 0.264,
1527
- "step": 2020
1528
- },
1529
- {
1530
- "epoch": 7.0979020979020975,
1531
- "grad_norm": 16.207624435424805,
1532
- "learning_rate": 2.662234170627083e-06,
1533
- "loss": 0.2492,
1534
- "step": 2030
1535
- },
1536
- {
1537
- "epoch": 7.1328671328671325,
1538
- "grad_norm": 11.598509788513184,
1539
- "learning_rate": 2.995013441955468e-06,
1540
- "loss": 0.3486,
1541
- "step": 2040
1542
- },
1543
- {
1544
- "epoch": 7.1678321678321675,
1545
- "grad_norm": 11.6201810836792,
1546
- "learning_rate": 3.3277927132838533e-06,
1547
- "loss": 0.2353,
1548
- "step": 2050
1549
- },
1550
- {
1551
- "epoch": 7.2027972027972025,
1552
- "grad_norm": 18.30818748474121,
1553
- "learning_rate": 3.327758515152761e-06,
1554
- "loss": 0.1996,
1555
- "step": 2060
1556
- },
1557
- {
1558
- "epoch": 7.2377622377622375,
1559
- "grad_norm": 16.372011184692383,
1560
- "learning_rate": 3.327655922165234e-06,
1561
- "loss": 0.2892,
1562
- "step": 2070
1563
- },
1564
- {
1565
- "epoch": 7.2727272727272725,
1566
- "grad_norm": 13.199930191040039,
1567
- "learning_rate": 3.327484938538469e-06,
1568
- "loss": 0.3681,
1569
- "step": 2080
1570
- },
1571
- {
1572
- "epoch": 7.3076923076923075,
1573
- "grad_norm": 14.532537460327148,
1574
- "learning_rate": 3.3272455713009334e-06,
1575
- "loss": 0.1764,
1576
- "step": 2090
1577
- },
1578
- {
1579
- "epoch": 7.3426573426573425,
1580
- "grad_norm": 10.235544204711914,
1581
- "learning_rate": 3.326937830292076e-06,
1582
- "loss": 0.3104,
1583
- "step": 2100
1584
- },
1585
- {
1586
- "epoch": 7.3426573426573425,
1587
- "eval_loss": 0.7017927765846252,
1588
- "eval_runtime": 5.43,
1589
- "eval_samples_per_second": 11.786,
1590
- "eval_steps_per_second": 11.786,
1591
- "step": 2100
1592
- },
1593
- {
1594
- "epoch": 7.3776223776223775,
1595
- "grad_norm": 11.722810745239258,
1596
- "learning_rate": 3.3265617281619253e-06,
1597
- "loss": 0.3426,
1598
- "step": 2110
1599
- },
1600
- {
1601
- "epoch": 7.4125874125874125,
1602
- "grad_norm": 10.100516319274902,
1603
- "learning_rate": 3.326117280370566e-06,
1604
- "loss": 0.2169,
1605
- "step": 2120
1606
- },
1607
- {
1608
- "epoch": 7.4475524475524475,
1609
- "grad_norm": 9.289050102233887,
1610
- "learning_rate": 3.325604505187506e-06,
1611
- "loss": 0.2244,
1612
- "step": 2130
1613
- },
1614
- {
1615
- "epoch": 7.4825174825174825,
1616
- "grad_norm": 4.566373825073242,
1617
- "learning_rate": 3.3250234236909255e-06,
1618
- "loss": 0.1558,
1619
- "step": 2140
1620
- },
1621
- {
1622
- "epoch": 7.5174825174825175,
1623
- "grad_norm": 15.955838203430176,
1624
- "learning_rate": 3.324374059766808e-06,
1625
- "loss": 0.2828,
1626
- "step": 2150
1627
- },
1628
- {
1629
- "epoch": 7.5524475524475525,
1630
- "grad_norm": 2.7527740001678467,
1631
- "learning_rate": 3.3236564401079615e-06,
1632
- "loss": 0.2772,
1633
- "step": 2160
1634
- },
1635
- {
1636
- "epoch": 7.5874125874125875,
1637
- "grad_norm": 11.847249031066895,
1638
- "learning_rate": 3.322870594212919e-06,
1639
- "loss": 0.2531,
1640
- "step": 2170
1641
- },
1642
- {
1643
- "epoch": 7.6223776223776225,
1644
- "grad_norm": 11.005929946899414,
1645
- "learning_rate": 3.3220165543847277e-06,
1646
- "loss": 0.3826,
1647
- "step": 2180
1648
- },
1649
- {
1650
- "epoch": 7.6573426573426575,
1651
- "grad_norm": 8.029391288757324,
1652
- "learning_rate": 3.3210943557296204e-06,
1653
- "loss": 0.2823,
1654
- "step": 2190
1655
- },
1656
- {
1657
- "epoch": 7.6923076923076925,
1658
- "grad_norm": 7.657909870147705,
1659
- "learning_rate": 3.3201040361555703e-06,
1660
- "loss": 0.23,
1661
- "step": 2200
1662
- },
1663
- {
1664
- "epoch": 7.7272727272727275,
1665
- "grad_norm": 15.844249725341797,
1666
- "learning_rate": 3.3190456363707377e-06,
1667
- "loss": 0.258,
1668
- "step": 2210
1669
- },
1670
- {
1671
- "epoch": 7.7622377622377625,
1672
- "grad_norm": 8.142477035522461,
1673
- "learning_rate": 3.3179191998817917e-06,
1674
- "loss": 0.2407,
1675
- "step": 2220
1676
- },
1677
- {
1678
- "epoch": 7.7972027972027975,
1679
- "grad_norm": 18.25770378112793,
1680
- "learning_rate": 3.3167247729921246e-06,
1681
- "loss": 0.2807,
1682
- "step": 2230
1683
- },
1684
- {
1685
- "epoch": 7.8321678321678325,
1686
- "grad_norm": 13.97754192352295,
1687
- "learning_rate": 3.315462404799947e-06,
1688
- "loss": 0.2449,
1689
- "step": 2240
1690
- },
1691
- {
1692
- "epoch": 7.867132867132867,
1693
- "grad_norm": 12.67829418182373,
1694
- "learning_rate": 3.314132147196272e-06,
1695
- "loss": 0.2594,
1696
- "step": 2250
1697
- },
1698
- {
1699
- "epoch": 7.867132867132867,
1700
- "eval_loss": 0.7046529054641724,
1701
- "eval_runtime": 5.4467,
1702
- "eval_samples_per_second": 11.75,
1703
- "eval_steps_per_second": 11.75,
1704
- "step": 2250
1705
- },
1706
- {
1707
- "epoch": 7.902097902097902,
1708
- "grad_norm": 4.242758750915527,
1709
- "learning_rate": 1.657066073598136e-07,
1710
- "loss": 0.2084,
1711
- "step": 2260
1712
- },
1713
- {
1714
- "epoch": 7.937062937062937,
1715
- "grad_norm": 9.143125534057617,
1716
- "learning_rate": 3.314132147196272e-07,
1717
- "loss": 0.1923,
1718
- "step": 2270
1719
- },
1720
- {
1721
- "epoch": 7.972027972027972,
1722
- "grad_norm": 10.585535049438477,
1723
- "learning_rate": 4.971198220794408e-07,
1724
- "loss": 0.204,
1725
- "step": 2280
1726
- },
1727
- {
1728
- "epoch": 8.006993006993007,
1729
- "grad_norm": 10.469385147094727,
1730
- "learning_rate": 6.628264294392544e-07,
1731
- "loss": 0.3009,
1732
- "step": 2290
1733
- },
1734
- {
1735
- "epoch": 8.041958041958042,
1736
- "grad_norm": 7.694136142730713,
1737
- "learning_rate": 8.28533036799068e-07,
1738
- "loss": 0.2215,
1739
- "step": 2300
1740
- },
1741
- {
1742
- "epoch": 8.076923076923077,
1743
- "grad_norm": 7.215337753295898,
1744
- "learning_rate": 9.942396441588815e-07,
1745
- "loss": 0.1923,
1746
- "step": 2310
1747
- },
1748
- {
1749
- "epoch": 8.111888111888112,
1750
- "grad_norm": 15.670652389526367,
1751
- "learning_rate": 1.159946251518695e-06,
1752
- "loss": 0.2335,
1753
- "step": 2320
1754
- },
1755
- {
1756
- "epoch": 8.146853146853147,
1757
- "grad_norm": 12.610808372497559,
1758
- "learning_rate": 1.3256528588785089e-06,
1759
- "loss": 0.3092,
1760
- "step": 2330
1761
- },
1762
- {
1763
- "epoch": 8.181818181818182,
1764
- "grad_norm": 20.136999130249023,
1765
- "learning_rate": 1.4913594662383224e-06,
1766
- "loss": 0.2475,
1767
- "step": 2340
1768
- },
1769
- {
1770
- "epoch": 8.216783216783217,
1771
- "grad_norm": 7.424393177032471,
1772
- "learning_rate": 1.657066073598136e-06,
1773
- "loss": 0.183,
1774
- "step": 2350
1775
- },
1776
- {
1777
- "epoch": 8.251748251748252,
1778
- "grad_norm": 12.980989456176758,
1779
- "learning_rate": 1.6570490447241355e-06,
1780
- "loss": 0.2455,
1781
- "step": 2360
1782
- },
1783
- {
1784
- "epoch": 8.286713286713287,
1785
- "grad_norm": 13.760506629943848,
1786
- "learning_rate": 1.6569979588021245e-06,
1787
- "loss": 0.2709,
1788
- "step": 2370
1789
- },
1790
- {
1791
- "epoch": 8.321678321678322,
1792
- "grad_norm": 6.805473804473877,
1793
- "learning_rate": 1.6569128179320452e-06,
1794
- "loss": 0.2012,
1795
- "step": 2380
1796
- },
1797
- {
1798
- "epoch": 8.356643356643357,
1799
- "grad_norm": 6.797975063323975,
1800
- "learning_rate": 1.656793625613705e-06,
1801
- "loss": 0.208,
1802
- "step": 2390
1803
- },
1804
- {
1805
- "epoch": 8.391608391608392,
1806
- "grad_norm": 24.712387084960938,
1807
- "learning_rate": 1.6566403867466338e-06,
1808
- "loss": 0.3556,
1809
- "step": 2400
1810
- },
1811
- {
1812
- "epoch": 8.391608391608392,
1813
- "eval_loss": 0.7055321931838989,
1814
- "eval_runtime": 5.3773,
1815
- "eval_samples_per_second": 11.902,
1816
- "eval_steps_per_second": 11.902,
1817
- "step": 2400
1818
- },
1819
- {
1820
- "epoch": 8.426573426573427,
1821
- "grad_norm": 15.138517379760742,
1822
- "learning_rate": 1.6564531076298806e-06,
1823
- "loss": 0.2524,
1824
- "step": 2410
1825
- },
1826
- {
1827
- "epoch": 8.461538461538462,
1828
- "grad_norm": 12.319635391235352,
1829
- "learning_rate": 1.656231795961757e-06,
1830
- "loss": 0.1701,
1831
- "step": 2420
1832
- },
1833
- {
1834
- "epoch": 8.496503496503497,
1835
- "grad_norm": 11.40451431274414,
1836
- "learning_rate": 1.6559764608395181e-06,
1837
- "loss": 0.1976,
1838
- "step": 2430
1839
- },
1840
- {
1841
- "epoch": 8.531468531468532,
1842
- "grad_norm": 6.014434337615967,
1843
- "learning_rate": 1.6556871127589914e-06,
1844
- "loss": 0.2757,
1845
- "step": 2440
1846
- },
1847
- {
1848
- "epoch": 8.566433566433567,
1849
- "grad_norm": 11.623228073120117,
1850
- "learning_rate": 1.6553637636141427e-06,
1851
- "loss": 0.225,
1852
- "step": 2450
1853
- },
1854
- {
1855
- "epoch": 8.601398601398602,
1856
- "grad_norm": 16.436216354370117,
1857
- "learning_rate": 1.6550064266965894e-06,
1858
- "loss": 0.2616,
1859
- "step": 2460
1860
- },
1861
- {
1862
- "epoch": 8.636363636363637,
1863
- "grad_norm": 3.8843276500701904,
1864
- "learning_rate": 1.654615116695052e-06,
1865
- "loss": 0.2325,
1866
- "step": 2470
1867
- },
1868
- {
1869
- "epoch": 8.671328671328672,
1870
- "grad_norm": 12.226479530334473,
1871
- "learning_rate": 1.6541898496947524e-06,
1872
- "loss": 0.3145,
1873
- "step": 2480
1874
- },
1875
- {
1876
- "epoch": 8.706293706293707,
1877
- "grad_norm": 20.737321853637695,
1878
- "learning_rate": 1.6537306431767512e-06,
1879
- "loss": 0.3799,
1880
- "step": 2490
1881
- },
1882
- {
1883
- "epoch": 8.741258741258742,
1884
- "grad_norm": 8.938512802124023,
1885
- "learning_rate": 1.6532375160172298e-06,
1886
- "loss": 0.2303,
1887
- "step": 2500
1888
- },
1889
- {
1890
- "epoch": 8.776223776223777,
1891
- "grad_norm": 18.450578689575195,
1892
- "learning_rate": 1.652710488486714e-06,
1893
- "loss": 0.2347,
1894
- "step": 2510
1895
- },
1896
- {
1897
- "epoch": 8.811188811188812,
1898
- "grad_norm": 3.751770496368408,
1899
- "learning_rate": 1.6521495822492419e-06,
1900
- "loss": 0.3062,
1901
- "step": 2520
1902
- },
1903
- {
1904
- "epoch": 8.846153846153847,
1905
- "grad_norm": 10.544769287109375,
1906
- "learning_rate": 1.6515548203614717e-06,
1907
- "loss": 0.2331,
1908
- "step": 2530
1909
- },
1910
- {
1911
- "epoch": 8.881118881118882,
1912
- "grad_norm": 20.1557559967041,
1913
- "learning_rate": 1.650926227271735e-06,
1914
- "loss": 0.2732,
1915
- "step": 2540
1916
- },
1917
- {
1918
- "epoch": 8.916083916083917,
1919
- "grad_norm": 16.894678115844727,
1920
- "learning_rate": 1.6502638288190317e-06,
1921
- "loss": 0.3163,
1922
- "step": 2550
1923
- },
1924
- {
1925
- "epoch": 8.916083916083917,
1926
- "eval_loss": 0.70442795753479,
1927
- "eval_runtime": 5.4147,
1928
- "eval_samples_per_second": 11.82,
1929
- "eval_steps_per_second": 11.82,
1930
- "step": 2550
1931
- },
1932
- {
1933
- "epoch": 8.951048951048952,
1934
- "grad_norm": 10.007589340209961,
1935
- "learning_rate": 1.649567652231968e-06,
1936
- "loss": 0.2869,
1937
- "step": 2560
1938
- },
1939
- {
1940
- "epoch": 8.986013986013987,
1941
- "grad_norm": 9.064579963684082,
1942
- "learning_rate": 1.6488377261276365e-06,
1943
- "loss": 0.3091,
1944
- "step": 2570
1945
- },
1946
- {
1947
- "epoch": 9.020979020979022,
1948
- "grad_norm": 11.941916465759277,
1949
- "learning_rate": 1.6480740805104402e-06,
1950
- "loss": 0.303,
1951
- "step": 2580
1952
- },
1953
- {
1954
- "epoch": 9.055944055944057,
1955
- "grad_norm": 10.506561279296875,
1956
- "learning_rate": 1.6472767467708597e-06,
1957
- "loss": 0.3275,
1958
- "step": 2590
1959
- },
1960
- {
1961
- "epoch": 9.090909090909092,
1962
- "grad_norm": 5.165719985961914,
1963
- "learning_rate": 1.6464457576841624e-06,
1964
- "loss": 0.3021,
1965
- "step": 2600
1966
- },
1967
- {
1968
- "epoch": 9.125874125874127,
1969
- "grad_norm": 15.611152648925781,
1970
- "learning_rate": 1.6455811474090539e-06,
1971
- "loss": 0.198,
1972
- "step": 2610
1973
- },
1974
- {
1975
- "epoch": 9.16083916083916,
1976
- "grad_norm": 10.496397018432617,
1977
- "learning_rate": 1.6446829514862772e-06,
1978
- "loss": 0.1879,
1979
- "step": 2620
1980
- },
1981
- {
1982
- "epoch": 9.195804195804195,
1983
- "grad_norm": 10.941329002380371,
1984
- "learning_rate": 1.6437512068371481e-06,
1985
- "loss": 0.1579,
1986
- "step": 2630
1987
- },
1988
- {
1989
- "epoch": 9.23076923076923,
1990
- "grad_norm": 6.740403652191162,
1991
- "learning_rate": 1.6427859517620401e-06,
1992
- "loss": 0.2877,
1993
- "step": 2640
1994
- },
1995
- {
1996
- "epoch": 9.265734265734265,
1997
- "grad_norm": 10.660934448242188,
1998
- "learning_rate": 1.6417872259388084e-06,
1999
- "loss": 0.3105,
2000
- "step": 2650
2001
- },
2002
- {
2003
- "epoch": 9.3006993006993,
2004
- "grad_norm": 12.463582992553711,
2005
- "learning_rate": 1.6407550704211601e-06,
2006
- "loss": 0.3788,
2007
- "step": 2660
2008
- },
2009
- {
2010
- "epoch": 9.335664335664335,
2011
- "grad_norm": 15.10091781616211,
2012
- "learning_rate": 1.6396895276369656e-06,
2013
- "loss": 0.2632,
2014
- "step": 2670
2015
- },
2016
- {
2017
- "epoch": 9.37062937062937,
2018
- "grad_norm": 15.757782936096191,
2019
- "learning_rate": 1.6385906413865154e-06,
2020
- "loss": 0.2933,
2021
- "step": 2680
2022
- },
2023
- {
2024
- "epoch": 9.405594405594405,
2025
- "grad_norm": 6.034332752227783,
2026
- "learning_rate": 1.6374584568407192e-06,
2027
- "loss": 0.2493,
2028
- "step": 2690
2029
- },
2030
- {
2031
- "epoch": 9.44055944055944,
2032
- "grad_norm": 8.291542053222656,
2033
- "learning_rate": 1.6362930205392493e-06,
2034
- "loss": 0.2736,
2035
- "step": 2700
2036
- },
2037
- {
2038
- "epoch": 9.44055944055944,
2039
- "eval_loss": 0.7050421833992004,
2040
- "eval_runtime": 5.3694,
2041
- "eval_samples_per_second": 11.919,
2042
- "eval_steps_per_second": 11.919,
2043
- "step": 2700
2044
  }
2045
  ],
2046
  "logging_steps": 10,
2047
  "max_steps": 5000,
2048
  "num_input_tokens_seen": 0,
2049
- "num_train_epochs": 18,
2050
  "save_steps": 150,
2051
  "stateful_callbacks": {
2052
  "TrainerControl": {
@@ -2060,8 +139,8 @@
2060
  "attributes": {}
2061
  }
2062
  },
2063
- "total_flos": 3.475745395708723e+16,
2064
- "train_batch_size": 2,
2065
  "trial_name": null,
2066
  "trial_params": null
2067
  }
 
1
  {
2
+ "best_metric": 1.851840853691101,
3
+ "best_model_checkpoint": "./output/checkpoint-150",
4
+ "epoch": 0.19157088122605365,
5
  "eval_steps": 150,
6
+ "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.01277139208173691,
13
+ "grad_norm": 4.169848918914795,
14
+ "learning_rate": 3.4694137880813836e-06,
15
+ "loss": 1.5133,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.02554278416347382,
20
+ "grad_norm": 4.01235294342041,
21
+ "learning_rate": 6.938827576162767e-06,
22
+ "loss": 1.4784,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.038314176245210725,
27
+ "grad_norm": 4.190587043762207,
28
+ "learning_rate": 1.0408241364244149e-05,
29
+ "loss": 1.4826,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.05108556832694764,
34
+ "grad_norm": 4.053359031677246,
35
+ "learning_rate": 1.3877655152325534e-05,
36
+ "loss": 1.4692,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.06385696040868455,
41
+ "grad_norm": 3.621781349182129,
42
+ "learning_rate": 1.7347068940406916e-05,
43
+ "loss": 1.4071,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.07662835249042145,
48
+ "grad_norm": 3.5056958198547363,
49
+ "learning_rate": 2.0816482728488298e-05,
50
+ "loss": 1.3342,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.08939974457215837,
55
+ "grad_norm": 3.9697868824005127,
56
+ "learning_rate": 2.428589651656968e-05,
57
+ "loss": 1.4,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.10217113665389528,
62
+ "grad_norm": 4.204615592956543,
63
+ "learning_rate": 2.775531030465107e-05,
64
+ "loss": 1.4064,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.11494252873563218,
69
+ "grad_norm": 4.627971172332764,
70
+ "learning_rate": 3.122472409273245e-05,
71
+ "loss": 1.6121,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.1277139208173691,
76
+ "grad_norm": 4.687252521514893,
77
+ "learning_rate": 3.469413788081383e-05,
78
+ "loss": 1.8409,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.140485312899106,
83
+ "grad_norm": 4.754763126373291,
84
+ "learning_rate": 3.4693781345783675e-05,
85
+ "loss": 1.7964,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.1532567049808429,
90
+ "grad_norm": 4.089818477630615,
91
+ "learning_rate": 3.469271175534895e-05,
92
+ "loss": 1.855,
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 0.16602809706257982,
97
+ "grad_norm": 4.230756759643555,
98
+ "learning_rate": 3.469092915347635e-05,
99
+ "loss": 1.7698,
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 0.17879948914431673,
104
+ "grad_norm": 3.6243300437927246,
105
+ "learning_rate": 3.468843361344164e-05,
106
+ "loss": 1.7402,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 0.19157088122605365,
111
+ "grad_norm": 3.6760964393615723,
112
+ "learning_rate": 3.46852252378267e-05,
113
+ "loss": 1.8524,
114
  "step": 150
115
  },
116
  {
117
+ "epoch": 0.19157088122605365,
118
+ "eval_loss": 1.851840853691101,
119
+ "eval_runtime": 44.0581,
120
+ "eval_samples_per_second": 11.349,
121
+ "eval_steps_per_second": 11.349,
122
  "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  }
124
  ],
125
  "logging_steps": 10,
126
  "max_steps": 5000,
127
  "num_input_tokens_seen": 0,
128
+ "num_train_epochs": 7,
129
  "save_steps": 150,
130
  "stateful_callbacks": {
131
  "TrainerControl": {
 
139
  "attributes": {}
140
  }
141
  },
142
+ "total_flos": 1.6492029466902528e+16,
143
+ "train_batch_size": 16,
144
  "trial_name": null,
145
  "trial_params": null
146
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a0d5bba2c9918c620fba2221cb48c4a965010d48cc94b490be628c47019c308
3
  size 5496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c167a5959afcf2631a587e2c79af9dc4334a99f07862c928036bfa576ad08c6b
3
  size 5496
last-checkpoint/vocab.json ADDED
The diff for this file is too large to render. See raw diff