andrea colombo committed on
Commit
618f041
·
1 Parent(s): dd45b16

load model

Browse files
README.md CHANGED
@@ -1,10 +1,18 @@
1
  ---
2
- license: apache-2.0
3
- language:
4
- - en
5
- base_model:
6
- - mistralai/Mistral-7B-Instruct-v0.1
7
- pipeline_tag: text2text-generation
8
- tags:
9
- - legal
10
- ---
 
 
 
 
 
 
 
 
 
1
  ---
2
+ base_model: mistralai/Mistral-7B-Instruct-v0.1
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Mistral-7B-Instruct-v0.1 Law Topic Extractor
7
+
8
+ A Mistral-7B-Instruct-v0.1 model fine-tuned to extract topics from the text of US Public Law articles (titles). It is fine-tuned on a set of 18k high-quality law title–topic pairs.
9
+
10
+ ## Model Details
11
+
12
+ ### Model Description
13
+
14
+ Finetuned from model: mistralai/Mistral-7B-Instruct-v0.1
15
+ - **Developed by:** Andrea Colombo, Politecnico di Milano
16
+ - **Model type:** text generation
17
+ - **License:** Apache 2.0
18
+ - **Finetuned from model:** mistralai/Mistral-7B-Instruct-v0.1
adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.1,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "q_proj",
24
+ "v_proj"
25
+ ],
26
+ "task_type": "CAUSAL_LM",
27
+ "use_dora": false,
28
+ "use_rslora": false
29
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a71d6ebf68b79bce475b3a90d44f858ddf633a49974bfca2912a9366e58a1b9
3
+ size 109069176
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10dfed55e6f9ff5a718bfcb57209f7275901977d487fdae4cb7e963ab86412d9
3
+ size 218211962
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f30b6b17e2aa33838e4dd26cc90815b4a23abb2a11e12e834dcba1345cffebb3
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36a1765114743c18ccc9a84fb2ccd51909437c9cc749f3909363dac9c7aa518b
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n",
34
+ "clean_up_tokenization_spaces": false,
35
+ "eos_token": "</s>",
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
trainer_state.json ADDED
@@ -0,0 +1,556 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
+ "eval_steps": 20,
6
+ "global_step": 695,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.07194244604316546,
13
+ "grad_norm": 0.7139045596122742,
14
+ "learning_rate": 0.0002,
15
+ "loss": 2.0811,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.14388489208633093,
20
+ "grad_norm": 0.5728554129600525,
21
+ "learning_rate": 0.0002,
22
+ "loss": 1.567,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.2158273381294964,
27
+ "grad_norm": 0.28863629698753357,
28
+ "learning_rate": 0.0002,
29
+ "loss": 1.357,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.28776978417266186,
34
+ "grad_norm": 0.3229255974292755,
35
+ "learning_rate": 0.0002,
36
+ "loss": 1.2521,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.3597122302158273,
41
+ "grad_norm": 0.33988726139068604,
42
+ "learning_rate": 0.0002,
43
+ "loss": 1.2164,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.4316546762589928,
48
+ "grad_norm": 0.40238669514656067,
49
+ "learning_rate": 0.0002,
50
+ "loss": 1.1385,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.5035971223021583,
55
+ "grad_norm": 0.38314664363861084,
56
+ "learning_rate": 0.0002,
57
+ "loss": 1.0973,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.5755395683453237,
62
+ "grad_norm": 0.3884238302707672,
63
+ "learning_rate": 0.0002,
64
+ "loss": 1.0836,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.6474820143884892,
69
+ "grad_norm": 0.3774580657482147,
70
+ "learning_rate": 0.0002,
71
+ "loss": 1.0409,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.7194244604316546,
76
+ "grad_norm": 0.43280744552612305,
77
+ "learning_rate": 0.0002,
78
+ "loss": 1.0462,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.7913669064748201,
83
+ "grad_norm": 0.3758571445941925,
84
+ "learning_rate": 0.0002,
85
+ "loss": 1.0206,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.8633093525179856,
90
+ "grad_norm": 0.3923165798187256,
91
+ "learning_rate": 0.0002,
92
+ "loss": 1.0219,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.935251798561151,
97
+ "grad_norm": 0.4213840663433075,
98
+ "learning_rate": 0.0002,
99
+ "loss": 0.9971,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 1.0,
104
+ "eval_loss": 0.9775540828704834,
105
+ "eval_runtime": 21.5909,
106
+ "eval_samples_per_second": 4.771,
107
+ "eval_steps_per_second": 0.602,
108
+ "step": 139
109
+ },
110
+ {
111
+ "epoch": 1.0071942446043165,
112
+ "grad_norm": 0.40076276659965515,
113
+ "learning_rate": 0.0002,
114
+ "loss": 0.9885,
115
+ "step": 140
116
+ },
117
+ {
118
+ "epoch": 1.079136690647482,
119
+ "grad_norm": 0.3987070918083191,
120
+ "learning_rate": 0.0002,
121
+ "loss": 0.9555,
122
+ "step": 150
123
+ },
124
+ {
125
+ "epoch": 1.1510791366906474,
126
+ "grad_norm": 0.41527315974235535,
127
+ "learning_rate": 0.0002,
128
+ "loss": 0.9391,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 1.223021582733813,
133
+ "grad_norm": 0.42107248306274414,
134
+ "learning_rate": 0.0002,
135
+ "loss": 0.9332,
136
+ "step": 170
137
+ },
138
+ {
139
+ "epoch": 1.2949640287769784,
140
+ "grad_norm": 0.4587080776691437,
141
+ "learning_rate": 0.0002,
142
+ "loss": 0.9163,
143
+ "step": 180
144
+ },
145
+ {
146
+ "epoch": 1.3669064748201438,
147
+ "grad_norm": 0.3977225124835968,
148
+ "learning_rate": 0.0002,
149
+ "loss": 0.9142,
150
+ "step": 190
151
+ },
152
+ {
153
+ "epoch": 1.4388489208633093,
154
+ "grad_norm": 0.41111239790916443,
155
+ "learning_rate": 0.0002,
156
+ "loss": 0.9088,
157
+ "step": 200
158
+ },
159
+ {
160
+ "epoch": 1.5107913669064748,
161
+ "grad_norm": 0.4326966404914856,
162
+ "learning_rate": 0.0002,
163
+ "loss": 0.9082,
164
+ "step": 210
165
+ },
166
+ {
167
+ "epoch": 1.5827338129496402,
168
+ "grad_norm": 0.3831544816493988,
169
+ "learning_rate": 0.0002,
170
+ "loss": 0.908,
171
+ "step": 220
172
+ },
173
+ {
174
+ "epoch": 1.6546762589928057,
175
+ "grad_norm": 0.39992555975914,
176
+ "learning_rate": 0.0002,
177
+ "loss": 0.9076,
178
+ "step": 230
179
+ },
180
+ {
181
+ "epoch": 1.7266187050359711,
182
+ "grad_norm": 0.39961057901382446,
183
+ "learning_rate": 0.0002,
184
+ "loss": 0.9059,
185
+ "step": 240
186
+ },
187
+ {
188
+ "epoch": 1.7985611510791366,
189
+ "grad_norm": 0.3854500949382782,
190
+ "learning_rate": 0.0002,
191
+ "loss": 0.8903,
192
+ "step": 250
193
+ },
194
+ {
195
+ "epoch": 1.870503597122302,
196
+ "grad_norm": 0.4092749357223511,
197
+ "learning_rate": 0.0002,
198
+ "loss": 0.8904,
199
+ "step": 260
200
+ },
201
+ {
202
+ "epoch": 1.9424460431654675,
203
+ "grad_norm": 0.40900877118110657,
204
+ "learning_rate": 0.0002,
205
+ "loss": 0.8679,
206
+ "step": 270
207
+ },
208
+ {
209
+ "epoch": 2.0,
210
+ "eval_loss": 0.8940790891647339,
211
+ "eval_runtime": 21.5978,
212
+ "eval_samples_per_second": 4.769,
213
+ "eval_steps_per_second": 0.602,
214
+ "step": 278
215
+ },
216
+ {
217
+ "epoch": 2.014388489208633,
218
+ "grad_norm": 0.3746669888496399,
219
+ "learning_rate": 0.0002,
220
+ "loss": 0.8669,
221
+ "step": 280
222
+ },
223
+ {
224
+ "epoch": 2.0863309352517985,
225
+ "grad_norm": 0.4724111258983612,
226
+ "learning_rate": 0.0002,
227
+ "loss": 0.8304,
228
+ "step": 290
229
+ },
230
+ {
231
+ "epoch": 2.158273381294964,
232
+ "grad_norm": 0.39437365531921387,
233
+ "learning_rate": 0.0002,
234
+ "loss": 0.8318,
235
+ "step": 300
236
+ },
237
+ {
238
+ "epoch": 2.2302158273381294,
239
+ "grad_norm": 0.4238971471786499,
240
+ "learning_rate": 0.0002,
241
+ "loss": 0.829,
242
+ "step": 310
243
+ },
244
+ {
245
+ "epoch": 2.302158273381295,
246
+ "grad_norm": 0.37740206718444824,
247
+ "learning_rate": 0.0002,
248
+ "loss": 0.8235,
249
+ "step": 320
250
+ },
251
+ {
252
+ "epoch": 2.3741007194244603,
253
+ "grad_norm": 0.40223780274391174,
254
+ "learning_rate": 0.0002,
255
+ "loss": 0.8328,
256
+ "step": 330
257
+ },
258
+ {
259
+ "epoch": 2.446043165467626,
260
+ "grad_norm": 0.4160473048686981,
261
+ "learning_rate": 0.0002,
262
+ "loss": 0.8252,
263
+ "step": 340
264
+ },
265
+ {
266
+ "epoch": 2.5179856115107913,
267
+ "grad_norm": 0.4427769184112549,
268
+ "learning_rate": 0.0002,
269
+ "loss": 0.8215,
270
+ "step": 350
271
+ },
272
+ {
273
+ "epoch": 2.5899280575539567,
274
+ "grad_norm": 0.47273996472358704,
275
+ "learning_rate": 0.0002,
276
+ "loss": 0.819,
277
+ "step": 360
278
+ },
279
+ {
280
+ "epoch": 2.661870503597122,
281
+ "grad_norm": 0.38981807231903076,
282
+ "learning_rate": 0.0002,
283
+ "loss": 0.8225,
284
+ "step": 370
285
+ },
286
+ {
287
+ "epoch": 2.7338129496402876,
288
+ "grad_norm": 0.385079562664032,
289
+ "learning_rate": 0.0002,
290
+ "loss": 0.8045,
291
+ "step": 380
292
+ },
293
+ {
294
+ "epoch": 2.805755395683453,
295
+ "grad_norm": 0.3796544075012207,
296
+ "learning_rate": 0.0002,
297
+ "loss": 0.8234,
298
+ "step": 390
299
+ },
300
+ {
301
+ "epoch": 2.8776978417266186,
302
+ "grad_norm": 0.3638385236263275,
303
+ "learning_rate": 0.0002,
304
+ "loss": 0.8195,
305
+ "step": 400
306
+ },
307
+ {
308
+ "epoch": 2.949640287769784,
309
+ "grad_norm": 0.37602856755256653,
310
+ "learning_rate": 0.0002,
311
+ "loss": 0.831,
312
+ "step": 410
313
+ },
314
+ {
315
+ "epoch": 3.0,
316
+ "eval_loss": 0.8596920967102051,
317
+ "eval_runtime": 21.5962,
318
+ "eval_samples_per_second": 4.769,
319
+ "eval_steps_per_second": 0.602,
320
+ "step": 417
321
+ },
322
+ {
323
+ "epoch": 3.0215827338129495,
324
+ "grad_norm": 0.3858413100242615,
325
+ "learning_rate": 0.0002,
326
+ "loss": 0.8076,
327
+ "step": 420
328
+ },
329
+ {
330
+ "epoch": 3.093525179856115,
331
+ "grad_norm": 0.39952272176742554,
332
+ "learning_rate": 0.0002,
333
+ "loss": 0.7777,
334
+ "step": 430
335
+ },
336
+ {
337
+ "epoch": 3.1654676258992804,
338
+ "grad_norm": 0.42352095246315,
339
+ "learning_rate": 0.0002,
340
+ "loss": 0.7704,
341
+ "step": 440
342
+ },
343
+ {
344
+ "epoch": 3.237410071942446,
345
+ "grad_norm": 0.4132436513900757,
346
+ "learning_rate": 0.0002,
347
+ "loss": 0.768,
348
+ "step": 450
349
+ },
350
+ {
351
+ "epoch": 3.3093525179856114,
352
+ "grad_norm": 0.414110392332077,
353
+ "learning_rate": 0.0002,
354
+ "loss": 0.7631,
355
+ "step": 460
356
+ },
357
+ {
358
+ "epoch": 3.381294964028777,
359
+ "grad_norm": 0.43551069498062134,
360
+ "learning_rate": 0.0002,
361
+ "loss": 0.7673,
362
+ "step": 470
363
+ },
364
+ {
365
+ "epoch": 3.4532374100719423,
366
+ "grad_norm": 0.4042975604534149,
367
+ "learning_rate": 0.0002,
368
+ "loss": 0.7552,
369
+ "step": 480
370
+ },
371
+ {
372
+ "epoch": 3.5251798561151078,
373
+ "grad_norm": 0.4289880394935608,
374
+ "learning_rate": 0.0002,
375
+ "loss": 0.7559,
376
+ "step": 490
377
+ },
378
+ {
379
+ "epoch": 3.597122302158273,
380
+ "grad_norm": 0.39530348777770996,
381
+ "learning_rate": 0.0002,
382
+ "loss": 0.7661,
383
+ "step": 500
384
+ },
385
+ {
386
+ "epoch": 3.6690647482014387,
387
+ "grad_norm": 0.4326631724834442,
388
+ "learning_rate": 0.0002,
389
+ "loss": 0.7605,
390
+ "step": 510
391
+ },
392
+ {
393
+ "epoch": 3.741007194244604,
394
+ "grad_norm": 0.3833424150943756,
395
+ "learning_rate": 0.0002,
396
+ "loss": 0.7549,
397
+ "step": 520
398
+ },
399
+ {
400
+ "epoch": 3.81294964028777,
401
+ "grad_norm": 0.40698808431625366,
402
+ "learning_rate": 0.0002,
403
+ "loss": 0.764,
404
+ "step": 530
405
+ },
406
+ {
407
+ "epoch": 3.884892086330935,
408
+ "grad_norm": 0.39018431305885315,
409
+ "learning_rate": 0.0002,
410
+ "loss": 0.7653,
411
+ "step": 540
412
+ },
413
+ {
414
+ "epoch": 3.956834532374101,
415
+ "grad_norm": 0.37253016233444214,
416
+ "learning_rate": 0.0002,
417
+ "loss": 0.7677,
418
+ "step": 550
419
+ },
420
+ {
421
+ "epoch": 4.0,
422
+ "eval_loss": 0.8444326519966125,
423
+ "eval_runtime": 21.5712,
424
+ "eval_samples_per_second": 4.775,
425
+ "eval_steps_per_second": 0.603,
426
+ "step": 556
427
+ },
428
+ {
429
+ "epoch": 4.028776978417266,
430
+ "grad_norm": 0.4029178023338318,
431
+ "learning_rate": 0.0002,
432
+ "loss": 0.7467,
433
+ "step": 560
434
+ },
435
+ {
436
+ "epoch": 4.100719424460432,
437
+ "grad_norm": 0.382028192281723,
438
+ "learning_rate": 0.0002,
439
+ "loss": 0.7116,
440
+ "step": 570
441
+ },
442
+ {
443
+ "epoch": 4.172661870503597,
444
+ "grad_norm": 0.42502614855766296,
445
+ "learning_rate": 0.0002,
446
+ "loss": 0.7298,
447
+ "step": 580
448
+ },
449
+ {
450
+ "epoch": 4.244604316546763,
451
+ "grad_norm": 0.4241486191749573,
452
+ "learning_rate": 0.0002,
453
+ "loss": 0.7073,
454
+ "step": 590
455
+ },
456
+ {
457
+ "epoch": 4.316546762589928,
458
+ "grad_norm": 0.4571862816810608,
459
+ "learning_rate": 0.0002,
460
+ "loss": 0.7106,
461
+ "step": 600
462
+ },
463
+ {
464
+ "epoch": 4.388489208633094,
465
+ "grad_norm": 0.43121734261512756,
466
+ "learning_rate": 0.0002,
467
+ "loss": 0.7031,
468
+ "step": 610
469
+ },
470
+ {
471
+ "epoch": 4.460431654676259,
472
+ "grad_norm": 0.40107443928718567,
473
+ "learning_rate": 0.0002,
474
+ "loss": 0.693,
475
+ "step": 620
476
+ },
477
+ {
478
+ "epoch": 4.532374100719425,
479
+ "grad_norm": 0.4040583670139313,
480
+ "learning_rate": 0.0002,
481
+ "loss": 0.707,
482
+ "step": 630
483
+ },
484
+ {
485
+ "epoch": 4.60431654676259,
486
+ "grad_norm": 0.4380245506763458,
487
+ "learning_rate": 0.0002,
488
+ "loss": 0.7066,
489
+ "step": 640
490
+ },
491
+ {
492
+ "epoch": 4.676258992805756,
493
+ "grad_norm": 0.4263726472854614,
494
+ "learning_rate": 0.0002,
495
+ "loss": 0.7216,
496
+ "step": 650
497
+ },
498
+ {
499
+ "epoch": 4.748201438848921,
500
+ "grad_norm": 0.4532300531864166,
501
+ "learning_rate": 0.0002,
502
+ "loss": 0.7337,
503
+ "step": 660
504
+ },
505
+ {
506
+ "epoch": 4.820143884892087,
507
+ "grad_norm": 0.4051191210746765,
508
+ "learning_rate": 0.0002,
509
+ "loss": 0.7133,
510
+ "step": 670
511
+ },
512
+ {
513
+ "epoch": 4.892086330935252,
514
+ "grad_norm": 0.434962660074234,
515
+ "learning_rate": 0.0002,
516
+ "loss": 0.7144,
517
+ "step": 680
518
+ },
519
+ {
520
+ "epoch": 4.9640287769784175,
521
+ "grad_norm": 0.4131017029285431,
522
+ "learning_rate": 0.0002,
523
+ "loss": 0.7167,
524
+ "step": 690
525
+ },
526
+ {
527
+ "epoch": 5.0,
528
+ "eval_loss": 0.8359549641609192,
529
+ "eval_runtime": 21.598,
530
+ "eval_samples_per_second": 4.769,
531
+ "eval_steps_per_second": 0.602,
532
+ "step": 695
533
+ }
534
+ ],
535
+ "logging_steps": 10,
536
+ "max_steps": 695,
537
+ "num_input_tokens_seen": 0,
538
+ "num_train_epochs": 5,
539
+ "save_steps": 500,
540
+ "stateful_callbacks": {
541
+ "TrainerControl": {
542
+ "args": {
543
+ "should_epoch_stop": false,
544
+ "should_evaluate": false,
545
+ "should_log": false,
546
+ "should_save": true,
547
+ "should_training_stop": true
548
+ },
549
+ "attributes": {}
550
+ }
551
+ },
552
+ "total_flos": 1.8287701530771456e+17,
553
+ "train_batch_size": 3,
554
+ "trial_name": null,
555
+ "trial_params": null
556
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f1f554400570d6b18d797afc057c09a63de0465159647f5d5d434e160481611
3
+ size 5432