starlineventures commited on
Commit
231f780
·
verified ·
1 Parent(s): 2e9ff27

starlineventures/pilot-talk

Browse files
README.md CHANGED
@@ -1,10 +1,11 @@
1
  ---
2
- base_model: microsoft/phi-2
3
  library_name: peft
4
  license: mit
5
  tags:
6
  - trl
7
  - sft
 
8
  - generated_from_trainer
9
  model-index:
10
  - name: outputs
@@ -16,7 +17,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # outputs
18
 
19
- This model is a fine-tuned version of [microsoft/phi-2](https://huggingface.co/microsoft/phi-2) on an unknown dataset.
20
 
21
  ## Model description
22
 
@@ -36,7 +37,7 @@ More information needed
36
 
37
  The following hyperparameters were used during training:
38
  - learning_rate: 0.0001
39
- - train_batch_size: 10
40
  - eval_batch_size: 16
41
  - seed: 3407
42
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 
1
  ---
2
+ base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
3
  library_name: peft
4
  license: mit
5
  tags:
6
  - trl
7
  - sft
8
+ - finetuned
9
  - generated_from_trainer
10
  model-index:
11
  - name: outputs
 
17
 
18
  # outputs
19
 
20
+ This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) on an unknown dataset.
21
 
22
  ## Model description
23
 
 
37
 
38
  The following hyperparameters were used during training:
39
  - learning_rate: 0.0001
40
+ - train_batch_size: 4
41
  - eval_batch_size: 16
42
  - seed: 3407
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
adapter_config.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": {
4
- "base_model_class": "PhiForCausalLM",
5
- "parent_library": "transformers.models.phi.modeling_phi"
6
  },
7
- "base_model_name_or_path": "microsoft/phi-2",
8
  "bias": "none",
9
  "fan_in_fan_out": false,
10
  "inference_mode": true,
@@ -26,9 +26,9 @@
26
  "down_proj",
27
  "k_proj",
28
  "up_proj",
29
- "v_proj",
30
  "q_proj",
31
  "gate_proj",
 
32
  "o_proj"
33
  ],
34
  "task_type": null,
 
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": {
4
+ "base_model_class": "Qwen2ForCausalLM",
5
+ "parent_library": "transformers.models.qwen2.modeling_qwen2"
6
  },
7
+ "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
8
  "bias": "none",
9
  "fan_in_fan_out": false,
10
  "inference_mode": true,
 
26
  "down_proj",
27
  "k_proj",
28
  "up_proj",
 
29
  "q_proj",
30
  "gate_proj",
31
+ "v_proj",
32
  "o_proj"
33
  ],
34
  "task_type": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f3a1cea7d969cef21da106d3e9c7d41b5283af202e93bf96911eebc61146215
3
- size 94422368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60d95b10b6e140a9626a7058d5038528f2ff80148dc4569b881db56052046509
3
+ size 40
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.125,
3
  "total_flos": 0.0,
4
- "train_loss": 0.2409752929911894,
5
- "train_runtime": 174.6588,
6
- "train_samples_per_second": 13.741,
7
- "train_steps_per_second": 1.374
8
  }
 
1
  {
2
+ "epoch": 3.0,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.11339402707914512,
5
+ "train_runtime": 443.7998,
6
+ "train_samples_per_second": 4.326,
7
+ "train_steps_per_second": 1.082
8
  }
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151643,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 1536,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 8960,
13
+ "max_position_embeddings": 131072,
14
+ "max_window_layers": 21,
15
+ "model_type": "qwen2",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 28,
18
+ "num_key_value_heads": 2,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_theta": 10000,
21
+ "sliding_window": null,
22
+ "tie_word_embeddings": false,
23
+ "torch_dtype": "bfloat16",
24
+ "transformers_version": "4.44.2",
25
+ "use_cache": false,
26
+ "use_mrope": false,
27
+ "use_sliding_window": false,
28
+ "vocab_size": 151936
29
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151646,
4
+ "do_sample": true,
5
+ "eos_token_id": 151643,
6
+ "temperature": 0.6,
7
+ "top_p": 0.95,
8
+ "transformers_version": "4.44.2"
9
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef0da30790a2243077da2841088a57e8b854924ab9dd99a138cac9fb09043fb8
3
+ size 3554214752
special_tokens_map.json CHANGED
@@ -1,24 +1,17 @@
1
  {
2
  "bos_token": {
3
- "content": "<|endoftext|>",
4
  "lstrip": false,
5
  "normalized": false,
6
  "rstrip": false,
7
  "single_word": false
8
  },
9
  "eos_token": {
10
- "content": "<|endoftext|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
- "pad_token": "<|endoftext|>",
17
- "unk_token": {
18
- "content": "<|endoftext|>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- }
24
  }
 
1
  {
2
  "bos_token": {
3
+ "content": "<|begin▁of▁sentence|>",
4
  "lstrip": false,
5
  "normalized": false,
6
  "rstrip": false,
7
  "single_word": false
8
  },
9
  "eos_token": {
10
+ "content": "<|end▁of▁sentence|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
+ "pad_token": "<|end▁of▁sentence|>"
 
 
 
 
 
 
 
17
  }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,327 +1,194 @@
1
  {
2
- "add_bos_token": false,
3
- "add_prefix_space": false,
 
4
  "added_tokens_decoder": {
5
- "50256": {
6
- "content": "<|endoftext|>",
7
  "lstrip": false,
8
  "normalized": false,
9
  "rstrip": false,
10
  "single_word": false,
11
  "special": true
12
  },
13
- "50257": {
14
- "content": " ",
15
  "lstrip": false,
16
- "normalized": true,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": false
20
- },
21
- "50258": {
22
- "content": " ",
23
- "lstrip": false,
24
- "normalized": true,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": false
28
- },
29
- "50259": {
30
- "content": " ",
31
- "lstrip": false,
32
- "normalized": true,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": false
36
- },
37
- "50260": {
38
- "content": " ",
39
- "lstrip": false,
40
- "normalized": true,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": false
44
- },
45
- "50261": {
46
- "content": " ",
47
- "lstrip": false,
48
- "normalized": true,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": false
52
- },
53
- "50262": {
54
- "content": " ",
55
- "lstrip": false,
56
- "normalized": true,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": false
60
- },
61
- "50263": {
62
- "content": " ",
63
- "lstrip": false,
64
- "normalized": true,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": false
68
- },
69
- "50264": {
70
- "content": " ",
71
- "lstrip": false,
72
- "normalized": true,
73
- "rstrip": false,
74
- "single_word": false,
75
- "special": false
76
- },
77
- "50265": {
78
- "content": " ",
79
- "lstrip": false,
80
- "normalized": true,
81
- "rstrip": false,
82
- "single_word": false,
83
- "special": false
84
- },
85
- "50266": {
86
- "content": " ",
87
- "lstrip": false,
88
- "normalized": true,
89
- "rstrip": false,
90
- "single_word": false,
91
- "special": false
92
- },
93
- "50267": {
94
- "content": " ",
95
- "lstrip": false,
96
- "normalized": true,
97
- "rstrip": false,
98
- "single_word": false,
99
- "special": false
100
- },
101
- "50268": {
102
- "content": " ",
103
- "lstrip": false,
104
- "normalized": true,
105
- "rstrip": false,
106
- "single_word": false,
107
- "special": false
108
- },
109
- "50269": {
110
- "content": " ",
111
- "lstrip": false,
112
- "normalized": true,
113
- "rstrip": false,
114
- "single_word": false,
115
- "special": false
116
- },
117
- "50270": {
118
- "content": " ",
119
- "lstrip": false,
120
- "normalized": true,
121
- "rstrip": false,
122
- "single_word": false,
123
- "special": false
124
- },
125
- "50271": {
126
- "content": " ",
127
- "lstrip": false,
128
- "normalized": true,
129
- "rstrip": false,
130
- "single_word": false,
131
- "special": false
132
- },
133
- "50272": {
134
- "content": " ",
135
- "lstrip": false,
136
- "normalized": true,
137
- "rstrip": false,
138
- "single_word": false,
139
- "special": false
140
- },
141
- "50273": {
142
- "content": " ",
143
- "lstrip": false,
144
- "normalized": true,
145
- "rstrip": false,
146
- "single_word": false,
147
- "special": false
148
- },
149
- "50274": {
150
- "content": " ",
151
- "lstrip": false,
152
- "normalized": true,
153
  "rstrip": false,
154
  "single_word": false,
155
  "special": false
156
  },
157
- "50275": {
158
- "content": " ",
159
  "lstrip": false,
160
- "normalized": true,
161
  "rstrip": false,
162
  "single_word": false,
163
  "special": false
164
  },
165
- "50276": {
166
- "content": " ",
167
  "lstrip": false,
168
- "normalized": true,
169
  "rstrip": false,
170
  "single_word": false,
171
- "special": false
172
  },
173
- "50277": {
174
- "content": " ",
175
  "lstrip": false,
176
- "normalized": true,
177
  "rstrip": false,
178
  "single_word": false,
179
  "special": false
180
  },
181
- "50278": {
182
- "content": " ",
183
  "lstrip": false,
184
- "normalized": true,
185
  "rstrip": false,
186
  "single_word": false,
187
  "special": false
188
  },
189
- "50279": {
190
- "content": " ",
191
  "lstrip": false,
192
- "normalized": true,
193
  "rstrip": false,
194
  "single_word": false,
195
  "special": false
196
  },
197
- "50280": {
198
- "content": " ",
199
  "lstrip": false,
200
- "normalized": true,
201
  "rstrip": false,
202
  "single_word": false,
203
- "special": false
204
  },
205
- "50281": {
206
- "content": " ",
207
  "lstrip": false,
208
- "normalized": true,
209
  "rstrip": false,
210
  "single_word": false,
211
- "special": false
212
  },
213
- "50282": {
214
- "content": " ",
215
  "lstrip": false,
216
- "normalized": true,
217
  "rstrip": false,
218
  "single_word": false,
219
- "special": false
220
  },
221
- "50283": {
222
- "content": " ",
223
  "lstrip": false,
224
- "normalized": true,
225
  "rstrip": false,
226
  "single_word": false,
227
- "special": false
228
  },
229
- "50284": {
230
- "content": " ",
231
  "lstrip": false,
232
- "normalized": true,
233
  "rstrip": false,
234
  "single_word": false,
235
- "special": false
236
  },
237
- "50285": {
238
- "content": " ",
239
  "lstrip": false,
240
- "normalized": true,
241
  "rstrip": false,
242
  "single_word": false,
243
- "special": false
244
  },
245
- "50286": {
246
- "content": " ",
247
  "lstrip": false,
248
- "normalized": true,
249
  "rstrip": false,
250
  "single_word": false,
251
- "special": false
252
  },
253
- "50287": {
254
- "content": "\t\t\t\t\t\t\t\t\t",
255
  "lstrip": false,
256
- "normalized": true,
257
  "rstrip": false,
258
  "single_word": false,
259
  "special": false
260
  },
261
- "50288": {
262
- "content": "\t\t\t\t\t\t\t\t",
263
  "lstrip": false,
264
- "normalized": true,
265
  "rstrip": false,
266
  "single_word": false,
267
  "special": false
268
  },
269
- "50289": {
270
- "content": "\t\t\t\t\t\t\t",
271
  "lstrip": false,
272
- "normalized": true,
273
  "rstrip": false,
274
  "single_word": false,
275
  "special": false
276
  },
277
- "50290": {
278
- "content": "\t\t\t\t\t\t",
279
  "lstrip": false,
280
- "normalized": true,
281
  "rstrip": false,
282
  "single_word": false,
283
  "special": false
284
  },
285
- "50291": {
286
- "content": "\t\t\t\t\t",
287
  "lstrip": false,
288
- "normalized": true,
289
  "rstrip": false,
290
  "single_word": false,
291
  "special": false
292
  },
293
- "50292": {
294
- "content": "\t\t\t\t",
295
  "lstrip": false,
296
- "normalized": true,
297
  "rstrip": false,
298
  "single_word": false,
299
  "special": false
300
  },
301
- "50293": {
302
- "content": "\t\t\t",
303
  "lstrip": false,
304
- "normalized": true,
305
  "rstrip": false,
306
  "single_word": false,
307
  "special": false
308
  },
309
- "50294": {
310
- "content": "\t\t",
311
  "lstrip": false,
312
- "normalized": true,
313
  "rstrip": false,
314
  "single_word": false,
315
  "special": false
316
  }
317
  },
318
- "bos_token": "<|endoftext|>",
319
- "clean_up_tokenization_spaces": true,
320
- "eos_token": "<|endoftext|>",
321
- "errors": "replace",
 
322
  "model_max_length": 2048,
323
- "pad_token": "<|endoftext|>",
324
- "return_token_type_ids": false,
325
- "tokenizer_class": "CodeGenTokenizer",
326
- "unk_token": "<|endoftext|>"
 
327
  }
 
1
  {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
  "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|end▁of▁sentence|>",
8
  "lstrip": false,
9
  "normalized": false,
10
  "rstrip": false,
11
  "single_word": false,
12
  "special": true
13
  },
14
+ "151644": {
15
+ "content": "<|User|>",
16
  "lstrip": false,
17
+ "normalized": false,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  "rstrip": false,
19
  "single_word": false,
20
  "special": false
21
  },
22
+ "151645": {
23
+ "content": "<|Assistant|>",
24
  "lstrip": false,
25
+ "normalized": false,
26
  "rstrip": false,
27
  "single_word": false,
28
  "special": false
29
  },
30
+ "151646": {
31
+ "content": "<|begin▁of▁sentence|>",
32
  "lstrip": false,
33
+ "normalized": false,
34
  "rstrip": false,
35
  "single_word": false,
36
+ "special": true
37
  },
38
+ "151647": {
39
+ "content": "<|EOT|>",
40
  "lstrip": false,
41
+ "normalized": false,
42
  "rstrip": false,
43
  "single_word": false,
44
  "special": false
45
  },
46
+ "151648": {
47
+ "content": "<think>",
48
  "lstrip": false,
49
+ "normalized": false,
50
  "rstrip": false,
51
  "single_word": false,
52
  "special": false
53
  },
54
+ "151649": {
55
+ "content": "</think>",
56
  "lstrip": false,
57
+ "normalized": false,
58
  "rstrip": false,
59
  "single_word": false,
60
  "special": false
61
  },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
  "lstrip": false,
65
+ "normalized": false,
66
  "rstrip": false,
67
  "single_word": false,
68
+ "special": true
69
  },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
  "lstrip": false,
73
+ "normalized": false,
74
  "rstrip": false,
75
  "single_word": false,
76
+ "special": true
77
  },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
  "lstrip": false,
81
+ "normalized": false,
82
  "rstrip": false,
83
  "single_word": false,
84
+ "special": true
85
  },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
  "lstrip": false,
89
+ "normalized": false,
90
  "rstrip": false,
91
  "single_word": false,
92
+ "special": true
93
  },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
  "lstrip": false,
97
+ "normalized": false,
98
  "rstrip": false,
99
  "single_word": false,
100
+ "special": true
101
  },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
  "lstrip": false,
105
+ "normalized": false,
106
  "rstrip": false,
107
  "single_word": false,
108
+ "special": true
109
  },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
  "lstrip": false,
113
+ "normalized": false,
114
  "rstrip": false,
115
  "single_word": false,
116
+ "special": true
117
  },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
  "lstrip": false,
121
+ "normalized": false,
122
  "rstrip": false,
123
  "single_word": false,
124
  "special": false
125
  },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
  "lstrip": false,
129
+ "normalized": false,
130
  "rstrip": false,
131
  "single_word": false,
132
  "special": false
133
  },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
  "lstrip": false,
137
+ "normalized": false,
138
  "rstrip": false,
139
  "single_word": false,
140
  "special": false
141
  },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
  "lstrip": false,
145
+ "normalized": false,
146
  "rstrip": false,
147
  "single_word": false,
148
  "special": false
149
  },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
  "lstrip": false,
153
+ "normalized": false,
154
  "rstrip": false,
155
  "single_word": false,
156
  "special": false
157
  },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
  "lstrip": false,
161
+ "normalized": false,
162
  "rstrip": false,
163
  "single_word": false,
164
  "special": false
165
  },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
  "lstrip": false,
169
+ "normalized": false,
170
  "rstrip": false,
171
  "single_word": false,
172
  "special": false
173
  },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
  "lstrip": false,
177
+ "normalized": false,
178
  "rstrip": false,
179
  "single_word": false,
180
  "special": false
181
  }
182
  },
183
+ "bos_token": "<|begin▁of▁sentence|>",
184
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}",
185
+ "clean_up_tokenization_spaces": false,
186
+ "eos_token": "<|end▁of▁sentence|>",
187
+ "legacy": true,
188
  "model_max_length": 2048,
189
+ "pad_token": "<|end▁of▁sentence|>",
190
+ "sp_model_kwargs": {},
191
+ "tokenizer_class": "LlamaTokenizer",
192
+ "unk_token": null,
193
+ "use_default_system_prompt": false
194
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.125,
3
  "total_flos": 0.0,
4
- "train_loss": 0.2409752929911894,
5
- "train_runtime": 174.6588,
6
- "train_samples_per_second": 13.741,
7
- "train_steps_per_second": 1.374
8
  }
 
1
  {
2
+ "epoch": 3.0,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.11339402707914512,
5
+ "train_runtime": 443.7998,
6
+ "train_samples_per_second": 4.326,
7
+ "train_steps_per_second": 1.082
8
  }
trainer_state.json CHANGED
@@ -1,144 +1,389 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.125,
5
  "eval_steps": 500,
6
- "global_step": 170,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
 
 
 
 
 
 
 
11
  {
12
  "epoch": 0.125,
13
- "grad_norm": 35.11359405517578,
14
  "learning_rate": 9.583333333333334e-05,
15
- "loss": 1.3709,
16
- "step": 10
 
 
 
 
 
 
 
17
  },
18
  {
19
  "epoch": 0.25,
20
- "grad_norm": 28.059900283813477,
21
  "learning_rate": 9.166666666666667e-05,
22
- "loss": 0.4762,
23
- "step": 20
 
 
 
 
 
 
 
24
  },
25
  {
26
  "epoch": 0.375,
27
- "grad_norm": 30.004230499267578,
28
  "learning_rate": 8.75e-05,
29
- "loss": 0.2966,
30
- "step": 30
 
 
 
 
 
 
 
31
  },
32
  {
33
  "epoch": 0.5,
34
- "grad_norm": 30.531251907348633,
35
  "learning_rate": 8.333333333333334e-05,
36
- "loss": 0.2166,
37
- "step": 40
 
 
 
 
 
 
 
38
  },
39
  {
40
  "epoch": 0.625,
41
- "grad_norm": 25.447418212890625,
42
  "learning_rate": 7.916666666666666e-05,
43
- "loss": 0.1816,
44
- "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  },
46
  {
47
  "epoch": 0.75,
48
- "grad_norm": 25.023717880249023,
49
  "learning_rate": 7.500000000000001e-05,
50
- "loss": 0.1653,
51
- "step": 60
 
 
 
 
 
 
 
52
  },
53
  {
54
  "epoch": 0.875,
55
- "grad_norm": 23.13313102722168,
56
  "learning_rate": 7.083333333333334e-05,
57
- "loss": 0.1506,
58
- "step": 70
 
 
 
 
 
 
 
59
  },
60
  {
61
  "epoch": 1.0,
62
- "grad_norm": 21.35039710998535,
63
  "learning_rate": 6.666666666666667e-05,
64
- "loss": 0.1369,
65
- "step": 80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  },
67
  {
68
  "epoch": 1.125,
69
- "grad_norm": 19.913026809692383,
70
  "learning_rate": 6.25e-05,
71
- "loss": 0.1315,
72
- "step": 90
 
 
 
 
 
 
 
73
  },
74
  {
75
  "epoch": 1.25,
76
- "grad_norm": 21.789073944091797,
77
  "learning_rate": 5.833333333333334e-05,
78
- "loss": 0.1296,
79
- "step": 100
 
 
 
 
 
 
 
80
  },
81
  {
82
  "epoch": 1.375,
83
- "grad_norm": 25.12034797668457,
84
  "learning_rate": 5.4166666666666664e-05,
85
- "loss": 0.1279,
86
- "step": 110
 
 
 
 
 
 
 
87
  },
88
  {
89
  "epoch": 1.5,
90
- "grad_norm": 25.35553550720215,
91
  "learning_rate": 5e-05,
92
- "loss": 0.1241,
93
- "step": 120
 
 
 
 
 
 
 
94
  },
95
  {
96
  "epoch": 1.625,
97
- "grad_norm": 23.74753761291504,
98
  "learning_rate": 4.5833333333333334e-05,
99
- "loss": 0.1208,
100
- "step": 130
 
 
 
 
 
 
 
101
  },
102
  {
103
  "epoch": 1.75,
104
- "grad_norm": 23.058244705200195,
105
  "learning_rate": 4.166666666666667e-05,
106
- "loss": 0.1188,
107
- "step": 140
 
 
 
 
 
 
 
108
  },
109
  {
110
  "epoch": 1.875,
111
- "grad_norm": 22.608642578125,
112
  "learning_rate": 3.7500000000000003e-05,
113
- "loss": 0.1174,
114
- "step": 150
 
 
 
 
 
 
 
115
  },
116
  {
117
  "epoch": 2.0,
118
- "grad_norm": 22.311994552612305,
119
  "learning_rate": 3.3333333333333335e-05,
120
- "loss": 0.1162,
121
- "step": 160
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  },
123
  {
124
  "epoch": 2.125,
125
- "grad_norm": 23.028535842895508,
126
  "learning_rate": 2.916666666666667e-05,
127
- "loss": 0.1158,
128
- "step": 170
129
  },
130
  {
131
- "epoch": 2.125,
132
- "step": 170,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  "total_flos": 0.0,
134
- "train_loss": 0.2409752929911894,
135
- "train_runtime": 174.6588,
136
- "train_samples_per_second": 13.741,
137
- "train_steps_per_second": 1.374
138
  }
139
  ],
140
  "logging_steps": 10,
141
- "max_steps": 240,
142
  "num_input_tokens_seen": 0,
143
  "num_train_epochs": 3,
144
  "save_steps": 500,
@@ -155,7 +400,7 @@
155
  }
156
  },
157
  "total_flos": 0.0,
158
- "train_batch_size": 10,
159
  "trial_name": null,
160
  "trial_params": null
161
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
  "eval_steps": 500,
6
+ "global_step": 480,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
+ {
12
+ "epoch": 0.0625,
13
+ "grad_norm": 0.6764543056488037,
14
+ "learning_rate": 9.791666666666667e-05,
15
+ "loss": 4.0135,
16
+ "step": 10
17
+ },
18
  {
19
  "epoch": 0.125,
20
+ "grad_norm": 0.2798255681991577,
21
  "learning_rate": 9.583333333333334e-05,
22
+ "loss": 0.1079,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.1875,
27
+ "grad_norm": 0.2010556012392044,
28
+ "learning_rate": 9.375e-05,
29
+ "loss": 0.0619,
30
+ "step": 30
31
  },
32
  {
33
  "epoch": 0.25,
34
+ "grad_norm": 0.19590723514556885,
35
  "learning_rate": 9.166666666666667e-05,
36
+ "loss": 0.0474,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.3125,
41
+ "grad_norm": 0.1653718799352646,
42
+ "learning_rate": 8.958333333333335e-05,
43
+ "loss": 0.0417,
44
+ "step": 50
45
  },
46
  {
47
  "epoch": 0.375,
48
+ "grad_norm": 0.19019761681556702,
49
  "learning_rate": 8.75e-05,
50
+ "loss": 0.037,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.4375,
55
+ "grad_norm": 0.18890416622161865,
56
+ "learning_rate": 8.541666666666666e-05,
57
+ "loss": 0.0339,
58
+ "step": 70
59
  },
60
  {
61
  "epoch": 0.5,
62
+ "grad_norm": 0.19193948805332184,
63
  "learning_rate": 8.333333333333334e-05,
64
+ "loss": 0.0317,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.5625,
69
+ "grad_norm": 0.12777990102767944,
70
+ "learning_rate": 8.125000000000001e-05,
71
+ "loss": 0.0301,
72
+ "step": 90
73
  },
74
  {
75
  "epoch": 0.625,
76
+ "grad_norm": 0.12041394412517548,
77
  "learning_rate": 7.916666666666666e-05,
78
+ "loss": 0.0287,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.6875,
83
+ "grad_norm": 0.1440989226102829,
84
+ "learning_rate": 7.708333333333334e-05,
85
+ "loss": 0.0288,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.6875,
90
+ "eval_runtime": 12.6929,
91
+ "eval_samples_per_second": 12.606,
92
+ "eval_steps_per_second": 0.788,
93
+ "step": 110
94
  },
95
  {
96
  "epoch": 0.75,
97
+ "grad_norm": 0.14094920456409454,
98
  "learning_rate": 7.500000000000001e-05,
99
+ "loss": 0.0284,
100
+ "step": 120
101
+ },
102
+ {
103
+ "epoch": 0.8125,
104
+ "grad_norm": 0.13362859189510345,
105
+ "learning_rate": 7.291666666666667e-05,
106
+ "loss": 0.028,
107
+ "step": 130
108
  },
109
  {
110
  "epoch": 0.875,
111
+ "grad_norm": 0.10230981558561325,
112
  "learning_rate": 7.083333333333334e-05,
113
+ "loss": 0.0278,
114
+ "step": 140
115
+ },
116
+ {
117
+ "epoch": 0.9375,
118
+ "grad_norm": 0.10484622418880463,
119
+ "learning_rate": 6.875e-05,
120
+ "loss": 0.0277,
121
+ "step": 150
122
  },
123
  {
124
  "epoch": 1.0,
125
+ "grad_norm": 0.1045694574713707,
126
  "learning_rate": 6.666666666666667e-05,
127
+ "loss": 0.0277,
128
+ "step": 160
129
+ },
130
+ {
131
+ "epoch": 1.0,
132
+ "eval_runtime": 11.2891,
133
+ "eval_samples_per_second": 14.173,
134
+ "eval_steps_per_second": 0.886,
135
+ "step": 160
136
+ },
137
+ {
138
+ "epoch": 1.0625,
139
+ "grad_norm": 0.11217521131038666,
140
+ "learning_rate": 6.458333333333334e-05,
141
+ "loss": 0.0267,
142
+ "step": 170
143
  },
144
  {
145
  "epoch": 1.125,
146
+ "grad_norm": 0.1110721006989479,
147
  "learning_rate": 6.25e-05,
148
+ "loss": 0.027,
149
+ "step": 180
150
+ },
151
+ {
152
+ "epoch": 1.1875,
153
+ "grad_norm": 0.12359625101089478,
154
+ "learning_rate": 6.041666666666667e-05,
155
+ "loss": 0.0272,
156
+ "step": 190
157
  },
158
  {
159
  "epoch": 1.25,
160
+ "grad_norm": 0.10066195577383041,
161
  "learning_rate": 5.833333333333334e-05,
162
+ "loss": 0.027,
163
+ "step": 200
164
+ },
165
+ {
166
+ "epoch": 1.3125,
167
+ "grad_norm": 0.10776817798614502,
168
+ "learning_rate": 5.6250000000000005e-05,
169
+ "loss": 0.0269,
170
+ "step": 210
171
  },
172
  {
173
  "epoch": 1.375,
174
+ "grad_norm": 0.10342445224523544,
175
  "learning_rate": 5.4166666666666664e-05,
176
+ "loss": 0.027,
177
+ "step": 220
178
+ },
179
+ {
180
+ "epoch": 1.4375,
181
+ "grad_norm": 0.0953444242477417,
182
+ "learning_rate": 5.208333333333334e-05,
183
+ "loss": 0.027,
184
+ "step": 230
185
  },
186
  {
187
  "epoch": 1.5,
188
+ "grad_norm": 0.10930311679840088,
189
  "learning_rate": 5e-05,
190
+ "loss": 0.0269,
191
+ "step": 240
192
+ },
193
+ {
194
+ "epoch": 1.5625,
195
+ "grad_norm": 0.11344899982213974,
196
+ "learning_rate": 4.791666666666667e-05,
197
+ "loss": 0.0268,
198
+ "step": 250
199
  },
200
  {
201
  "epoch": 1.625,
202
+ "grad_norm": 0.10314708203077316,
203
  "learning_rate": 4.5833333333333334e-05,
204
+ "loss": 0.0267,
205
+ "step": 260
206
+ },
207
+ {
208
+ "epoch": 1.6875,
209
+ "grad_norm": 0.11634312570095062,
210
+ "learning_rate": 4.375e-05,
211
+ "loss": 0.0266,
212
+ "step": 270
213
  },
214
  {
215
  "epoch": 1.75,
216
+ "grad_norm": 0.09650956094264984,
217
  "learning_rate": 4.166666666666667e-05,
218
+ "loss": 0.0267,
219
+ "step": 280
220
+ },
221
+ {
222
+ "epoch": 1.8125,
223
+ "grad_norm": 0.10790986567735672,
224
+ "learning_rate": 3.958333333333333e-05,
225
+ "loss": 0.0266,
226
+ "step": 290
227
  },
228
  {
229
  "epoch": 1.875,
230
+ "grad_norm": 0.10706381499767303,
231
  "learning_rate": 3.7500000000000003e-05,
232
+ "loss": 0.0264,
233
+ "step": 300
234
+ },
235
+ {
236
+ "epoch": 1.9375,
237
+ "grad_norm": 0.11093062162399292,
238
+ "learning_rate": 3.541666666666667e-05,
239
+ "loss": 0.0266,
240
+ "step": 310
241
  },
242
  {
243
  "epoch": 2.0,
244
+ "grad_norm": 0.10339082777500153,
245
  "learning_rate": 3.3333333333333335e-05,
246
+ "loss": 0.0265,
247
+ "step": 320
248
+ },
249
+ {
250
+ "epoch": 2.0,
251
+ "eval_runtime": 11.299,
252
+ "eval_samples_per_second": 14.161,
253
+ "eval_steps_per_second": 0.885,
254
+ "step": 320
255
+ },
256
+ {
257
+ "epoch": 2.0625,
258
+ "grad_norm": 0.10130282491445541,
259
+ "learning_rate": 3.125e-05,
260
+ "loss": 0.026,
261
+ "step": 330
262
  },
263
  {
264
  "epoch": 2.125,
265
+ "grad_norm": 0.10614955425262451,
266
  "learning_rate": 2.916666666666667e-05,
267
+ "loss": 0.0257,
268
+ "step": 340
269
  },
270
  {
271
+ "epoch": 2.1875,
272
+ "grad_norm": 0.1143997386097908,
273
+ "learning_rate": 2.7083333333333332e-05,
274
+ "loss": 0.0261,
275
+ "step": 350
276
+ },
277
+ {
278
+ "epoch": 2.25,
279
+ "grad_norm": 0.10745866596698761,
280
+ "learning_rate": 2.5e-05,
281
+ "loss": 0.0256,
282
+ "step": 360
283
+ },
284
+ {
285
+ "epoch": 2.3125,
286
+ "grad_norm": 0.1030006930232048,
287
+ "learning_rate": 2.2916666666666667e-05,
288
+ "loss": 0.0258,
289
+ "step": 370
290
+ },
291
+ {
292
+ "epoch": 2.375,
293
+ "grad_norm": 0.10611753165721893,
294
+ "learning_rate": 2.0833333333333336e-05,
295
+ "loss": 0.026,
296
+ "step": 380
297
+ },
298
+ {
299
+ "epoch": 2.4375,
300
+ "grad_norm": 0.10169661045074463,
301
+ "learning_rate": 1.8750000000000002e-05,
302
+ "loss": 0.0257,
303
+ "step": 390
304
+ },
305
+ {
306
+ "epoch": 2.5,
307
+ "grad_norm": 0.11938793212175369,
308
+ "learning_rate": 1.6666666666666667e-05,
309
+ "loss": 0.0258,
310
+ "step": 400
311
+ },
312
+ {
313
+ "epoch": 2.5625,
314
+ "grad_norm": 0.0978316143155098,
315
+ "learning_rate": 1.4583333333333335e-05,
316
+ "loss": 0.0259,
317
+ "step": 410
318
+ },
319
+ {
320
+ "epoch": 2.625,
321
+ "grad_norm": 0.09611309319734573,
322
+ "learning_rate": 1.25e-05,
323
+ "loss": 0.0258,
324
+ "step": 420
325
+ },
326
+ {
327
+ "epoch": 2.6875,
328
+ "grad_norm": 0.10421048849821091,
329
+ "learning_rate": 1.0416666666666668e-05,
330
+ "loss": 0.0258,
331
+ "step": 430
332
+ },
333
+ {
334
+ "epoch": 2.75,
335
+ "grad_norm": 0.09502692520618439,
336
+ "learning_rate": 8.333333333333334e-06,
337
+ "loss": 0.0257,
338
+ "step": 440
339
+ },
340
+ {
341
+ "epoch": 2.8125,
342
+ "grad_norm": 0.10091052949428558,
343
+ "learning_rate": 6.25e-06,
344
+ "loss": 0.0256,
345
+ "step": 450
346
+ },
347
+ {
348
+ "epoch": 2.875,
349
+ "grad_norm": 0.10061470419168472,
350
+ "learning_rate": 4.166666666666667e-06,
351
+ "loss": 0.0257,
352
+ "step": 460
353
+ },
354
+ {
355
+ "epoch": 2.9375,
356
+ "grad_norm": 0.09805355966091156,
357
+ "learning_rate": 2.0833333333333334e-06,
358
+ "loss": 0.0255,
359
+ "step": 470
360
+ },
361
+ {
362
+ "epoch": 3.0,
363
+ "grad_norm": 0.11753270030021667,
364
+ "learning_rate": 0.0,
365
+ "loss": 0.0256,
366
+ "step": 480
367
+ },
368
+ {
369
+ "epoch": 3.0,
370
+ "eval_runtime": 11.3126,
371
+ "eval_samples_per_second": 14.144,
372
+ "eval_steps_per_second": 0.884,
373
+ "step": 480
374
+ },
375
+ {
376
+ "epoch": 3.0,
377
+ "step": 480,
378
  "total_flos": 0.0,
379
+ "train_loss": 0.11339402707914512,
380
+ "train_runtime": 443.7998,
381
+ "train_samples_per_second": 4.326,
382
+ "train_steps_per_second": 1.082
383
  }
384
  ],
385
  "logging_steps": 10,
386
+ "max_steps": 480,
387
  "num_input_tokens_seen": 0,
388
  "num_train_epochs": 3,
389
  "save_steps": 500,
 
400
  }
401
  },
402
  "total_flos": 0.0,
403
+ "train_batch_size": 4,
404
  "trial_name": null,
405
  "trial_params": null
406
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0bf77fa574b65faae23151f91dc99b2b3a380810ab2b3e9268e86e0152036049
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3ea36db01dbb1c5915e1ff553ab1d2d31cd6119900ddbf872951d161132b290
3
  size 5432