Aze4ka committed on
Commit
e3b0db7
·
verified ·
1 Parent(s): 99e1b17

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-24328/source.spm filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-24328/target.spm filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-36492/source.spm filter=lfs diff=lfs merge=lfs -text
39
+ checkpoint-36492/target.spm filter=lfs diff=lfs merge=lfs -text
40
+ source.spm filter=lfs diff=lfs merge=lfs -text
41
+ target.spm filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ---
3
+ library_name: transformers
4
+ tags:
5
+ - autotrain
6
+ - text2text-generation
7
+ base_model: Helsinki-NLP/opus-mt-en-ru
8
+ widget:
9
+ - text: "I love AutoTrain"
10
+ ---
11
+
12
+ # Model Trained Using AutoTrain
13
+
14
+ - Problem type: Seq2Seq
15
+
16
+ ## Validation Metrics
17
+ loss: 0.5498164296150208
18
+
19
+ rouge1: 32.6453
20
+
21
+ rouge2: 28.31
22
+
23
+ rougeL: 32.1048
24
+
25
+ rougeLsum: 32.1708
26
+
27
+ gen_len: 33.8306
28
+
29
+ runtime: 1530.1218
30
+
31
+ samples_per_second: 14.132
32
+
33
+ steps_per_second: 0.442
34
+
35
+ epoch: 3.0
checkpoint-24328/config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Helsinki-NLP/opus-mt-en-ru",
3
+ "_num_labels": 3,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "swish",
6
+ "add_bias_logits": false,
7
+ "add_final_layer_norm": false,
8
+ "architectures": [
9
+ "MarianMTModel"
10
+ ],
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 0,
13
+ "classif_dropout": 0.0,
14
+ "classifier_dropout": 0.0,
15
+ "d_model": 512,
16
+ "decoder_attention_heads": 8,
17
+ "decoder_ffn_dim": 2048,
18
+ "decoder_layerdrop": 0.0,
19
+ "decoder_layers": 6,
20
+ "decoder_start_token_id": 62517,
21
+ "decoder_vocab_size": 62518,
22
+ "dropout": 0.1,
23
+ "encoder_attention_heads": 8,
24
+ "encoder_ffn_dim": 2048,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 6,
27
+ "eos_token_id": 0,
28
+ "forced_eos_token_id": 0,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1",
32
+ "2": "LABEL_2"
33
+ },
34
+ "init_std": 0.02,
35
+ "is_encoder_decoder": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_length": null,
42
+ "max_position_embeddings": 512,
43
+ "model_type": "marian",
44
+ "normalize_before": false,
45
+ "normalize_embedding": false,
46
+ "num_beams": null,
47
+ "num_hidden_layers": 6,
48
+ "pad_token_id": 62517,
49
+ "scale_embedding": true,
50
+ "share_encoder_decoder_embeddings": true,
51
+ "static_position_embeddings": true,
52
+ "torch_dtype": "float32",
53
+ "transformers_version": "4.48.0",
54
+ "use_cache": false,
55
+ "vocab_size": 62518
56
+ }
checkpoint-24328/generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bad_words_ids": [
3
+ [
4
+ 62517
5
+ ]
6
+ ],
7
+ "bos_token_id": 0,
8
+ "decoder_start_token_id": 62517,
9
+ "eos_token_id": 0,
10
+ "forced_eos_token_id": 0,
11
+ "max_length": 512,
12
+ "num_beams": 4,
13
+ "pad_token_id": 62517,
14
+ "renormalize_logits": true,
15
+ "transformers_version": "4.48.0"
16
+ }
checkpoint-24328/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6f5eef1d23257a753025aed5e386686bd8b7ab5dff373ae663be0b3be07bc92
3
+ size 304869976
checkpoint-24328/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:038a1951d0f8e7666720a62721a9af9b3297936481db72ce9cb8b0f92b47e378
3
+ size 609393274
checkpoint-24328/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:494c89fc9a0a31848381908a1476f97062c3b7ac43f5d026334eb3667939ea1e
3
+ size 14512
checkpoint-24328/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa8b05e18c01f9892cc1a0dccf8d4ad9ec04f7a1ad47e787bab7483f83f5d698
3
+ size 14512
checkpoint-24328/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7444e21e803fb824eee408a2cde84cd09e218a6ef226645fd65f99a9d4fa88cb
3
+ size 1064
checkpoint-24328/source.spm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16bebef1389a0b8ab452772c4e35b9e605e5713f8ac7baa71ca701394eaa086d
3
+ size 802781
checkpoint-24328/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
checkpoint-24328/target.spm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:745998e51ba5b058e38b7ac7765c25c43ed5c1c39cc92b27163b9b2e323c9d7c
3
+ size 1080169
checkpoint-24328/tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "</s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "62517": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": false,
29
+ "eos_token": "</s>",
30
+ "extra_special_tokens": {},
31
+ "model_max_length": 512,
32
+ "pad_token": "<pad>",
33
+ "separate_vocabs": false,
34
+ "source_lang": "en",
35
+ "sp_model_kwargs": {},
36
+ "target_lang": "ru",
37
+ "tokenizer_class": "MarianTokenizer",
38
+ "unk_token": "<unk>"
39
+ }
checkpoint-24328/trainer_state.json ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.5606569051742554,
3
+ "best_model_checkpoint": "ck3-localization/checkpoint-24328",
4
+ "epoch": 2.0,
5
+ "eval_steps": 500,
6
+ "global_step": 24328,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.041104899704044726,
13
+ "grad_norm": 6.12867546081543,
14
+ "learning_rate": 2.7397260273972604e-06,
15
+ "loss": 2.3513,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.08220979940808945,
20
+ "grad_norm": 6.480109691619873,
21
+ "learning_rate": 5.479452054794521e-06,
22
+ "loss": 1.4689,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.12331469911213416,
27
+ "grad_norm": 4.740694046020508,
28
+ "learning_rate": 8.219178082191782e-06,
29
+ "loss": 1.19,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.1644195988161789,
34
+ "grad_norm": 5.501825332641602,
35
+ "learning_rate": 1.0958904109589042e-05,
36
+ "loss": 1.0318,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.20552449852022361,
41
+ "grad_norm": 5.322640419006348,
42
+ "learning_rate": 1.3698630136986302e-05,
43
+ "loss": 0.9556,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.24662939822426833,
48
+ "grad_norm": 5.773024559020996,
49
+ "learning_rate": 1.6438356164383563e-05,
50
+ "loss": 0.9137,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.28773429792831307,
55
+ "grad_norm": 4.7130656242370605,
56
+ "learning_rate": 1.9178082191780822e-05,
57
+ "loss": 0.8705,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.3288391976323578,
62
+ "grad_norm": 3.756901741027832,
63
+ "learning_rate": 1.9786858291212475e-05,
64
+ "loss": 0.8531,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.3699440973364025,
69
+ "grad_norm": 4.033214092254639,
70
+ "learning_rate": 1.948237013580172e-05,
71
+ "loss": 0.8138,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.41104899704044723,
76
+ "grad_norm": 4.564835071563721,
77
+ "learning_rate": 1.9177881980390966e-05,
78
+ "loss": 0.7946,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.45215389674449197,
83
+ "grad_norm": 4.850255012512207,
84
+ "learning_rate": 1.887339382498021e-05,
85
+ "loss": 0.7833,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 0.49325879644853665,
90
+ "grad_norm": 4.716800689697266,
91
+ "learning_rate": 1.8568905669569456e-05,
92
+ "loss": 0.7662,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 0.5343636961525814,
97
+ "grad_norm": 4.0613274574279785,
98
+ "learning_rate": 1.82644175141587e-05,
99
+ "loss": 0.7668,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 0.5754685958566261,
104
+ "grad_norm": 4.6606831550598145,
105
+ "learning_rate": 1.7959929358747943e-05,
106
+ "loss": 0.7518,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 0.6165734955606709,
111
+ "grad_norm": 4.745144844055176,
112
+ "learning_rate": 1.765544120333719e-05,
113
+ "loss": 0.7477,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 0.6576783952647156,
118
+ "grad_norm": 4.414773464202881,
119
+ "learning_rate": 1.7350953047926437e-05,
120
+ "loss": 0.751,
121
+ "step": 8000
122
+ },
123
+ {
124
+ "epoch": 0.6987832949687602,
125
+ "grad_norm": 4.531556606292725,
126
+ "learning_rate": 1.704646489251568e-05,
127
+ "loss": 0.7424,
128
+ "step": 8500
129
+ },
130
+ {
131
+ "epoch": 0.739888194672805,
132
+ "grad_norm": 4.841054439544678,
133
+ "learning_rate": 1.6741976737104928e-05,
134
+ "loss": 0.7328,
135
+ "step": 9000
136
+ },
137
+ {
138
+ "epoch": 0.7809930943768497,
139
+ "grad_norm": 4.302633762359619,
140
+ "learning_rate": 1.6437488581694175e-05,
141
+ "loss": 0.725,
142
+ "step": 9500
143
+ },
144
+ {
145
+ "epoch": 0.8220979940808945,
146
+ "grad_norm": 3.530836343765259,
147
+ "learning_rate": 1.6133000426283418e-05,
148
+ "loss": 0.7072,
149
+ "step": 10000
150
+ },
151
+ {
152
+ "epoch": 0.8632028937849392,
153
+ "grad_norm": 4.2016191482543945,
154
+ "learning_rate": 1.5828512270872665e-05,
155
+ "loss": 0.6895,
156
+ "step": 10500
157
+ },
158
+ {
159
+ "epoch": 0.9043077934889839,
160
+ "grad_norm": 3.463226079940796,
161
+ "learning_rate": 1.5524024115461912e-05,
162
+ "loss": 0.7093,
163
+ "step": 11000
164
+ },
165
+ {
166
+ "epoch": 0.9454126931930286,
167
+ "grad_norm": 3.3261220455169678,
168
+ "learning_rate": 1.5219535960051155e-05,
169
+ "loss": 0.6941,
170
+ "step": 11500
171
+ },
172
+ {
173
+ "epoch": 0.9865175928970733,
174
+ "grad_norm": 3.778331756591797,
175
+ "learning_rate": 1.49150478046404e-05,
176
+ "loss": 0.7008,
177
+ "step": 12000
178
+ },
179
+ {
180
+ "epoch": 1.0,
181
+ "eval_gen_len": 33.8514,
182
+ "eval_loss": 0.6079365611076355,
183
+ "eval_rouge1": 31.9733,
184
+ "eval_rouge2": 27.2686,
185
+ "eval_rougeL": 31.3526,
186
+ "eval_rougeLsum": 31.4543,
187
+ "eval_runtime": 1591.2259,
188
+ "eval_samples_per_second": 13.59,
189
+ "eval_steps_per_second": 0.425,
190
+ "step": 12164
191
+ },
192
+ {
193
+ "epoch": 1.0276224926011182,
194
+ "grad_norm": 3.7499120235443115,
195
+ "learning_rate": 1.4610559649229646e-05,
196
+ "loss": 0.6284,
197
+ "step": 12500
198
+ },
199
+ {
200
+ "epoch": 1.0687273923051628,
201
+ "grad_norm": 3.52150821685791,
202
+ "learning_rate": 1.4306071493818891e-05,
203
+ "loss": 0.6337,
204
+ "step": 13000
205
+ },
206
+ {
207
+ "epoch": 1.1098322920092074,
208
+ "grad_norm": 4.520621299743652,
209
+ "learning_rate": 1.4001583338408138e-05,
210
+ "loss": 0.6292,
211
+ "step": 13500
212
+ },
213
+ {
214
+ "epoch": 1.1509371917132523,
215
+ "grad_norm": 4.275709629058838,
216
+ "learning_rate": 1.3697095182997382e-05,
217
+ "loss": 0.6195,
218
+ "step": 14000
219
+ },
220
+ {
221
+ "epoch": 1.192042091417297,
222
+ "grad_norm": 3.500743865966797,
223
+ "learning_rate": 1.3392607027586628e-05,
224
+ "loss": 0.6168,
225
+ "step": 14500
226
+ },
227
+ {
228
+ "epoch": 1.2331469911213417,
229
+ "grad_norm": 2.587315320968628,
230
+ "learning_rate": 1.3088118872175875e-05,
231
+ "loss": 0.6169,
232
+ "step": 15000
233
+ },
234
+ {
235
+ "epoch": 1.2742518908253864,
236
+ "grad_norm": 3.4539663791656494,
237
+ "learning_rate": 1.2783630716765119e-05,
238
+ "loss": 0.619,
239
+ "step": 15500
240
+ },
241
+ {
242
+ "epoch": 1.3153567905294312,
243
+ "grad_norm": 3.759758710861206,
244
+ "learning_rate": 1.2479142561354364e-05,
245
+ "loss": 0.612,
246
+ "step": 16000
247
+ },
248
+ {
249
+ "epoch": 1.3564616902334758,
250
+ "grad_norm": 2.6160857677459717,
251
+ "learning_rate": 1.217465440594361e-05,
252
+ "loss": 0.6179,
253
+ "step": 16500
254
+ },
255
+ {
256
+ "epoch": 1.3975665899375205,
257
+ "grad_norm": 3.3576176166534424,
258
+ "learning_rate": 1.1870166250532855e-05,
259
+ "loss": 0.6203,
260
+ "step": 17000
261
+ },
262
+ {
263
+ "epoch": 1.4386714896415653,
264
+ "grad_norm": 2.6305747032165527,
265
+ "learning_rate": 1.1565678095122101e-05,
266
+ "loss": 0.609,
267
+ "step": 17500
268
+ },
269
+ {
270
+ "epoch": 1.47977638934561,
271
+ "grad_norm": 3.3986129760742188,
272
+ "learning_rate": 1.1261189939711345e-05,
273
+ "loss": 0.6076,
274
+ "step": 18000
275
+ },
276
+ {
277
+ "epoch": 1.5208812890496546,
278
+ "grad_norm": 5.06044864654541,
279
+ "learning_rate": 1.0956701784300592e-05,
280
+ "loss": 0.6187,
281
+ "step": 18500
282
+ },
283
+ {
284
+ "epoch": 1.5619861887536994,
285
+ "grad_norm": 4.3670549392700195,
286
+ "learning_rate": 1.0652213628889839e-05,
287
+ "loss": 0.6063,
288
+ "step": 19000
289
+ },
290
+ {
291
+ "epoch": 1.6030910884577443,
292
+ "grad_norm": 8.928343772888184,
293
+ "learning_rate": 1.0347725473479082e-05,
294
+ "loss": 0.5948,
295
+ "step": 19500
296
+ },
297
+ {
298
+ "epoch": 1.644195988161789,
299
+ "grad_norm": 3.5316221714019775,
300
+ "learning_rate": 1.004323731806833e-05,
301
+ "loss": 0.6108,
302
+ "step": 20000
303
+ },
304
+ {
305
+ "epoch": 1.6853008878658335,
306
+ "grad_norm": 3.8091230392456055,
307
+ "learning_rate": 9.738749162657574e-06,
308
+ "loss": 0.6024,
309
+ "step": 20500
310
+ },
311
+ {
312
+ "epoch": 1.7264057875698784,
313
+ "grad_norm": 2.5065314769744873,
314
+ "learning_rate": 9.43426100724682e-06,
315
+ "loss": 0.595,
316
+ "step": 21000
317
+ },
318
+ {
319
+ "epoch": 1.767510687273923,
320
+ "grad_norm": 4.371850490570068,
321
+ "learning_rate": 9.129772851836063e-06,
322
+ "loss": 0.6063,
323
+ "step": 21500
324
+ },
325
+ {
326
+ "epoch": 1.8086155869779676,
327
+ "grad_norm": 3.6098592281341553,
328
+ "learning_rate": 8.82528469642531e-06,
329
+ "loss": 0.6158,
330
+ "step": 22000
331
+ },
332
+ {
333
+ "epoch": 1.8497204866820125,
334
+ "grad_norm": 4.037623405456543,
335
+ "learning_rate": 8.520796541014555e-06,
336
+ "loss": 0.6045,
337
+ "step": 22500
338
+ },
339
+ {
340
+ "epoch": 1.8908253863860573,
341
+ "grad_norm": 4.389125823974609,
342
+ "learning_rate": 8.2163083856038e-06,
343
+ "loss": 0.5876,
344
+ "step": 23000
345
+ },
346
+ {
347
+ "epoch": 1.931930286090102,
348
+ "grad_norm": 4.564250946044922,
349
+ "learning_rate": 7.911820230193046e-06,
350
+ "loss": 0.5857,
351
+ "step": 23500
352
+ },
353
+ {
354
+ "epoch": 1.9730351857941466,
355
+ "grad_norm": 4.7007365226745605,
356
+ "learning_rate": 7.607332074782292e-06,
357
+ "loss": 0.6188,
358
+ "step": 24000
359
+ },
360
+ {
361
+ "epoch": 2.0,
362
+ "eval_gen_len": 34.1084,
363
+ "eval_loss": 0.5606569051742554,
364
+ "eval_rouge1": 32.2894,
365
+ "eval_rouge2": 27.8387,
366
+ "eval_rougeL": 31.7251,
367
+ "eval_rougeLsum": 31.8152,
368
+ "eval_runtime": 1524.3799,
369
+ "eval_samples_per_second": 14.185,
370
+ "eval_steps_per_second": 0.443,
371
+ "step": 24328
372
+ }
373
+ ],
374
+ "logging_steps": 500,
375
+ "max_steps": 36492,
376
+ "num_input_tokens_seen": 0,
377
+ "num_train_epochs": 3,
378
+ "save_steps": 500,
379
+ "stateful_callbacks": {
380
+ "EarlyStoppingCallback": {
381
+ "args": {
382
+ "early_stopping_patience": 5,
383
+ "early_stopping_threshold": 0.01
384
+ },
385
+ "attributes": {
386
+ "early_stopping_patience_counter": 0
387
+ }
388
+ },
389
+ "TrainerControl": {
390
+ "args": {
391
+ "should_epoch_stop": false,
392
+ "should_evaluate": false,
393
+ "should_log": false,
394
+ "should_save": true,
395
+ "should_training_stop": false
396
+ },
397
+ "attributes": {}
398
+ }
399
+ },
400
+ "total_flos": 8379853808074752.0,
401
+ "train_batch_size": 8,
402
+ "trial_name": null,
403
+ "trial_params": null
404
+ }
checkpoint-24328/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b6cdaabc5186df4c830688bac90d2f77a2412e2b54715163a34f977335aa8ac
3
+ size 5496
checkpoint-24328/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-36492/config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Helsinki-NLP/opus-mt-en-ru",
3
+ "_num_labels": 3,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "swish",
6
+ "add_bias_logits": false,
7
+ "add_final_layer_norm": false,
8
+ "architectures": [
9
+ "MarianMTModel"
10
+ ],
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 0,
13
+ "classif_dropout": 0.0,
14
+ "classifier_dropout": 0.0,
15
+ "d_model": 512,
16
+ "decoder_attention_heads": 8,
17
+ "decoder_ffn_dim": 2048,
18
+ "decoder_layerdrop": 0.0,
19
+ "decoder_layers": 6,
20
+ "decoder_start_token_id": 62517,
21
+ "decoder_vocab_size": 62518,
22
+ "dropout": 0.1,
23
+ "encoder_attention_heads": 8,
24
+ "encoder_ffn_dim": 2048,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 6,
27
+ "eos_token_id": 0,
28
+ "forced_eos_token_id": 0,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1",
32
+ "2": "LABEL_2"
33
+ },
34
+ "init_std": 0.02,
35
+ "is_encoder_decoder": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_length": null,
42
+ "max_position_embeddings": 512,
43
+ "model_type": "marian",
44
+ "normalize_before": false,
45
+ "normalize_embedding": false,
46
+ "num_beams": null,
47
+ "num_hidden_layers": 6,
48
+ "pad_token_id": 62517,
49
+ "scale_embedding": true,
50
+ "share_encoder_decoder_embeddings": true,
51
+ "static_position_embeddings": true,
52
+ "torch_dtype": "float32",
53
+ "transformers_version": "4.48.0",
54
+ "use_cache": false,
55
+ "vocab_size": 62518
56
+ }
checkpoint-36492/generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bad_words_ids": [
3
+ [
4
+ 62517
5
+ ]
6
+ ],
7
+ "bos_token_id": 0,
8
+ "decoder_start_token_id": 62517,
9
+ "eos_token_id": 0,
10
+ "forced_eos_token_id": 0,
11
+ "max_length": 512,
12
+ "num_beams": 4,
13
+ "pad_token_id": 62517,
14
+ "renormalize_logits": true,
15
+ "transformers_version": "4.48.0"
16
+ }
checkpoint-36492/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:299782ae957802fbb6326a979be87baa6e1af0752fc628ad1b8603565ba98c82
3
+ size 304869976
checkpoint-36492/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43073178bdc78381cd9e75014954229a2d8ce3961fa0d6f5530c2d3e02123e67
3
+ size 609393274
checkpoint-36492/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:feb2244b9e3c379b21e267dc0e4b5b119f0584d6561dba538bd6d64835907f19
3
+ size 14512
checkpoint-36492/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28f36686ed5b4af5f2fc7e7972a2d7bd91bbf27fdaf653759297e9ef53d9e57d
3
+ size 14512
checkpoint-36492/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f6550b50dab793d8df12535663f382e792fe37fddac1a48240ad18ddaa075d6
3
+ size 1064
checkpoint-36492/source.spm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16bebef1389a0b8ab452772c4e35b9e605e5713f8ac7baa71ca701394eaa086d
3
+ size 802781
checkpoint-36492/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
checkpoint-36492/target.spm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:745998e51ba5b058e38b7ac7765c25c43ed5c1c39cc92b27163b9b2e323c9d7c
3
+ size 1080169
checkpoint-36492/tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "</s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "62517": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": false,
29
+ "eos_token": "</s>",
30
+ "extra_special_tokens": {},
31
+ "model_max_length": 512,
32
+ "pad_token": "<pad>",
33
+ "separate_vocabs": false,
34
+ "source_lang": "en",
35
+ "sp_model_kwargs": {},
36
+ "target_lang": "ru",
37
+ "tokenizer_class": "MarianTokenizer",
38
+ "unk_token": "<unk>"
39
+ }
checkpoint-36492/trainer_state.json ADDED
@@ -0,0 +1,585 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.5498164296150208,
3
+ "best_model_checkpoint": "ck3-localization/checkpoint-36492",
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 36492,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.041104899704044726,
13
+ "grad_norm": 6.12867546081543,
14
+ "learning_rate": 2.7397260273972604e-06,
15
+ "loss": 2.3513,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.08220979940808945,
20
+ "grad_norm": 6.480109691619873,
21
+ "learning_rate": 5.479452054794521e-06,
22
+ "loss": 1.4689,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.12331469911213416,
27
+ "grad_norm": 4.740694046020508,
28
+ "learning_rate": 8.219178082191782e-06,
29
+ "loss": 1.19,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.1644195988161789,
34
+ "grad_norm": 5.501825332641602,
35
+ "learning_rate": 1.0958904109589042e-05,
36
+ "loss": 1.0318,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.20552449852022361,
41
+ "grad_norm": 5.322640419006348,
42
+ "learning_rate": 1.3698630136986302e-05,
43
+ "loss": 0.9556,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.24662939822426833,
48
+ "grad_norm": 5.773024559020996,
49
+ "learning_rate": 1.6438356164383563e-05,
50
+ "loss": 0.9137,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.28773429792831307,
55
+ "grad_norm": 4.7130656242370605,
56
+ "learning_rate": 1.9178082191780822e-05,
57
+ "loss": 0.8705,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.3288391976323578,
62
+ "grad_norm": 3.756901741027832,
63
+ "learning_rate": 1.9786858291212475e-05,
64
+ "loss": 0.8531,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.3699440973364025,
69
+ "grad_norm": 4.033214092254639,
70
+ "learning_rate": 1.948237013580172e-05,
71
+ "loss": 0.8138,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.41104899704044723,
76
+ "grad_norm": 4.564835071563721,
77
+ "learning_rate": 1.9177881980390966e-05,
78
+ "loss": 0.7946,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.45215389674449197,
83
+ "grad_norm": 4.850255012512207,
84
+ "learning_rate": 1.887339382498021e-05,
85
+ "loss": 0.7833,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 0.49325879644853665,
90
+ "grad_norm": 4.716800689697266,
91
+ "learning_rate": 1.8568905669569456e-05,
92
+ "loss": 0.7662,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 0.5343636961525814,
97
+ "grad_norm": 4.0613274574279785,
98
+ "learning_rate": 1.82644175141587e-05,
99
+ "loss": 0.7668,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 0.5754685958566261,
104
+ "grad_norm": 4.6606831550598145,
105
+ "learning_rate": 1.7959929358747943e-05,
106
+ "loss": 0.7518,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 0.6165734955606709,
111
+ "grad_norm": 4.745144844055176,
112
+ "learning_rate": 1.765544120333719e-05,
113
+ "loss": 0.7477,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 0.6576783952647156,
118
+ "grad_norm": 4.414773464202881,
119
+ "learning_rate": 1.7350953047926437e-05,
120
+ "loss": 0.751,
121
+ "step": 8000
122
+ },
123
+ {
124
+ "epoch": 0.6987832949687602,
125
+ "grad_norm": 4.531556606292725,
126
+ "learning_rate": 1.704646489251568e-05,
127
+ "loss": 0.7424,
128
+ "step": 8500
129
+ },
130
+ {
131
+ "epoch": 0.739888194672805,
132
+ "grad_norm": 4.841054439544678,
133
+ "learning_rate": 1.6741976737104928e-05,
134
+ "loss": 0.7328,
135
+ "step": 9000
136
+ },
137
+ {
138
+ "epoch": 0.7809930943768497,
139
+ "grad_norm": 4.302633762359619,
140
+ "learning_rate": 1.6437488581694175e-05,
141
+ "loss": 0.725,
142
+ "step": 9500
143
+ },
144
+ {
145
+ "epoch": 0.8220979940808945,
146
+ "grad_norm": 3.530836343765259,
147
+ "learning_rate": 1.6133000426283418e-05,
148
+ "loss": 0.7072,
149
+ "step": 10000
150
+ },
151
+ {
152
+ "epoch": 0.8632028937849392,
153
+ "grad_norm": 4.2016191482543945,
154
+ "learning_rate": 1.5828512270872665e-05,
155
+ "loss": 0.6895,
156
+ "step": 10500
157
+ },
158
+ {
159
+ "epoch": 0.9043077934889839,
160
+ "grad_norm": 3.463226079940796,
161
+ "learning_rate": 1.5524024115461912e-05,
162
+ "loss": 0.7093,
163
+ "step": 11000
164
+ },
165
+ {
166
+ "epoch": 0.9454126931930286,
167
+ "grad_norm": 3.3261220455169678,
168
+ "learning_rate": 1.5219535960051155e-05,
169
+ "loss": 0.6941,
170
+ "step": 11500
171
+ },
172
+ {
173
+ "epoch": 0.9865175928970733,
174
+ "grad_norm": 3.778331756591797,
175
+ "learning_rate": 1.49150478046404e-05,
176
+ "loss": 0.7008,
177
+ "step": 12000
178
+ },
179
+ {
180
+ "epoch": 1.0,
181
+ "eval_gen_len": 33.8514,
182
+ "eval_loss": 0.6079365611076355,
183
+ "eval_rouge1": 31.9733,
184
+ "eval_rouge2": 27.2686,
185
+ "eval_rougeL": 31.3526,
186
+ "eval_rougeLsum": 31.4543,
187
+ "eval_runtime": 1591.2259,
188
+ "eval_samples_per_second": 13.59,
189
+ "eval_steps_per_second": 0.425,
190
+ "step": 12164
191
+ },
192
+ {
193
+ "epoch": 1.0276224926011182,
194
+ "grad_norm": 3.7499120235443115,
195
+ "learning_rate": 1.4610559649229646e-05,
196
+ "loss": 0.6284,
197
+ "step": 12500
198
+ },
199
+ {
200
+ "epoch": 1.0687273923051628,
201
+ "grad_norm": 3.52150821685791,
202
+ "learning_rate": 1.4306071493818891e-05,
203
+ "loss": 0.6337,
204
+ "step": 13000
205
+ },
206
+ {
207
+ "epoch": 1.1098322920092074,
208
+ "grad_norm": 4.520621299743652,
209
+ "learning_rate": 1.4001583338408138e-05,
210
+ "loss": 0.6292,
211
+ "step": 13500
212
+ },
213
+ {
214
+ "epoch": 1.1509371917132523,
215
+ "grad_norm": 4.275709629058838,
216
+ "learning_rate": 1.3697095182997382e-05,
217
+ "loss": 0.6195,
218
+ "step": 14000
219
+ },
220
+ {
221
+ "epoch": 1.192042091417297,
222
+ "grad_norm": 3.500743865966797,
223
+ "learning_rate": 1.3392607027586628e-05,
224
+ "loss": 0.6168,
225
+ "step": 14500
226
+ },
227
+ {
228
+ "epoch": 1.2331469911213417,
229
+ "grad_norm": 2.587315320968628,
230
+ "learning_rate": 1.3088118872175875e-05,
231
+ "loss": 0.6169,
232
+ "step": 15000
233
+ },
234
+ {
235
+ "epoch": 1.2742518908253864,
236
+ "grad_norm": 3.4539663791656494,
237
+ "learning_rate": 1.2783630716765119e-05,
238
+ "loss": 0.619,
239
+ "step": 15500
240
+ },
241
+ {
242
+ "epoch": 1.3153567905294312,
243
+ "grad_norm": 3.759758710861206,
244
+ "learning_rate": 1.2479142561354364e-05,
245
+ "loss": 0.612,
246
+ "step": 16000
247
+ },
248
+ {
249
+ "epoch": 1.3564616902334758,
250
+ "grad_norm": 2.6160857677459717,
251
+ "learning_rate": 1.217465440594361e-05,
252
+ "loss": 0.6179,
253
+ "step": 16500
254
+ },
255
+ {
256
+ "epoch": 1.3975665899375205,
257
+ "grad_norm": 3.3576176166534424,
258
+ "learning_rate": 1.1870166250532855e-05,
259
+ "loss": 0.6203,
260
+ "step": 17000
261
+ },
262
+ {
263
+ "epoch": 1.4386714896415653,
264
+ "grad_norm": 2.6305747032165527,
265
+ "learning_rate": 1.1565678095122101e-05,
266
+ "loss": 0.609,
267
+ "step": 17500
268
+ },
269
+ {
270
+ "epoch": 1.47977638934561,
271
+ "grad_norm": 3.3986129760742188,
272
+ "learning_rate": 1.1261189939711345e-05,
273
+ "loss": 0.6076,
274
+ "step": 18000
275
+ },
276
+ {
277
+ "epoch": 1.5208812890496546,
278
+ "grad_norm": 5.06044864654541,
279
+ "learning_rate": 1.0956701784300592e-05,
280
+ "loss": 0.6187,
281
+ "step": 18500
282
+ },
283
+ {
284
+ "epoch": 1.5619861887536994,
285
+ "grad_norm": 4.3670549392700195,
286
+ "learning_rate": 1.0652213628889839e-05,
287
+ "loss": 0.6063,
288
+ "step": 19000
289
+ },
290
+ {
291
+ "epoch": 1.6030910884577443,
292
+ "grad_norm": 8.928343772888184,
293
+ "learning_rate": 1.0347725473479082e-05,
294
+ "loss": 0.5948,
295
+ "step": 19500
296
+ },
297
+ {
298
+ "epoch": 1.644195988161789,
299
+ "grad_norm": 3.5316221714019775,
300
+ "learning_rate": 1.004323731806833e-05,
301
+ "loss": 0.6108,
302
+ "step": 20000
303
+ },
304
+ {
305
+ "epoch": 1.6853008878658335,
306
+ "grad_norm": 3.8091230392456055,
307
+ "learning_rate": 9.738749162657574e-06,
308
+ "loss": 0.6024,
309
+ "step": 20500
310
+ },
311
+ {
312
+ "epoch": 1.7264057875698784,
313
+ "grad_norm": 2.5065314769744873,
314
+ "learning_rate": 9.43426100724682e-06,
315
+ "loss": 0.595,
316
+ "step": 21000
317
+ },
318
+ {
319
+ "epoch": 1.767510687273923,
320
+ "grad_norm": 4.371850490570068,
321
+ "learning_rate": 9.129772851836063e-06,
322
+ "loss": 0.6063,
323
+ "step": 21500
324
+ },
325
+ {
326
+ "epoch": 1.8086155869779676,
327
+ "grad_norm": 3.6098592281341553,
328
+ "learning_rate": 8.82528469642531e-06,
329
+ "loss": 0.6158,
330
+ "step": 22000
331
+ },
332
+ {
333
+ "epoch": 1.8497204866820125,
334
+ "grad_norm": 4.037623405456543,
335
+ "learning_rate": 8.520796541014555e-06,
336
+ "loss": 0.6045,
337
+ "step": 22500
338
+ },
339
+ {
340
+ "epoch": 1.8908253863860573,
341
+ "grad_norm": 4.389125823974609,
342
+ "learning_rate": 8.2163083856038e-06,
343
+ "loss": 0.5876,
344
+ "step": 23000
345
+ },
346
+ {
347
+ "epoch": 1.931930286090102,
348
+ "grad_norm": 4.564250946044922,
349
+ "learning_rate": 7.911820230193046e-06,
350
+ "loss": 0.5857,
351
+ "step": 23500
352
+ },
353
+ {
354
+ "epoch": 1.9730351857941466,
355
+ "grad_norm": 4.7007365226745605,
356
+ "learning_rate": 7.607332074782292e-06,
357
+ "loss": 0.6188,
358
+ "step": 24000
359
+ },
360
+ {
361
+ "epoch": 2.0,
362
+ "eval_gen_len": 34.1084,
363
+ "eval_loss": 0.5606569051742554,
364
+ "eval_rouge1": 32.2894,
365
+ "eval_rouge2": 27.8387,
366
+ "eval_rougeL": 31.7251,
367
+ "eval_rougeLsum": 31.8152,
368
+ "eval_runtime": 1524.3799,
369
+ "eval_samples_per_second": 14.185,
370
+ "eval_steps_per_second": 0.443,
371
+ "step": 24328
372
+ },
373
+ {
374
+ "epoch": 2.0141400854981915,
375
+ "grad_norm": 1.9762619733810425,
376
+ "learning_rate": 7.302843919371537e-06,
377
+ "loss": 0.5736,
378
+ "step": 24500
379
+ },
380
+ {
381
+ "epoch": 2.0552449852022363,
382
+ "grad_norm": 4.161807060241699,
383
+ "learning_rate": 6.998355763960782e-06,
384
+ "loss": 0.5458,
385
+ "step": 25000
386
+ },
387
+ {
388
+ "epoch": 2.0963498849062807,
389
+ "grad_norm": 2.221466541290283,
390
+ "learning_rate": 6.693867608550027e-06,
391
+ "loss": 0.5468,
392
+ "step": 25500
393
+ },
394
+ {
395
+ "epoch": 2.1374547846103256,
396
+ "grad_norm": 3.642603874206543,
397
+ "learning_rate": 6.3893794531392735e-06,
398
+ "loss": 0.5556,
399
+ "step": 26000
400
+ },
401
+ {
402
+ "epoch": 2.1785596843143704,
403
+ "grad_norm": 3.3420753479003906,
404
+ "learning_rate": 6.084891297728519e-06,
405
+ "loss": 0.5451,
406
+ "step": 26500
407
+ },
408
+ {
409
+ "epoch": 2.219664584018415,
410
+ "grad_norm": 3.2656307220458984,
411
+ "learning_rate": 5.780403142317764e-06,
412
+ "loss": 0.5303,
413
+ "step": 27000
414
+ },
415
+ {
416
+ "epoch": 2.2607694837224597,
417
+ "grad_norm": 3.4211599826812744,
418
+ "learning_rate": 5.475914986907009e-06,
419
+ "loss": 0.5496,
420
+ "step": 27500
421
+ },
422
+ {
423
+ "epoch": 2.3018743834265045,
424
+ "grad_norm": 4.965065956115723,
425
+ "learning_rate": 5.171426831496256e-06,
426
+ "loss": 0.5392,
427
+ "step": 28000
428
+ },
429
+ {
430
+ "epoch": 2.342979283130549,
431
+ "grad_norm": 2.4368653297424316,
432
+ "learning_rate": 4.866938676085501e-06,
433
+ "loss": 0.5443,
434
+ "step": 28500
435
+ },
436
+ {
437
+ "epoch": 2.384084182834594,
438
+ "grad_norm": 2.6363422870635986,
439
+ "learning_rate": 4.562450520674746e-06,
440
+ "loss": 0.5418,
441
+ "step": 29000
442
+ },
443
+ {
444
+ "epoch": 2.4251890825386386,
445
+ "grad_norm": 4.417261123657227,
446
+ "learning_rate": 4.257962365263992e-06,
447
+ "loss": 0.5377,
448
+ "step": 29500
449
+ },
450
+ {
451
+ "epoch": 2.4662939822426835,
452
+ "grad_norm": 2.850874900817871,
453
+ "learning_rate": 3.953474209853237e-06,
454
+ "loss": 0.5406,
455
+ "step": 30000
456
+ },
457
+ {
458
+ "epoch": 2.507398881946728,
459
+ "grad_norm": 3.0470714569091797,
460
+ "learning_rate": 3.6489860544424825e-06,
461
+ "loss": 0.5277,
462
+ "step": 30500
463
+ },
464
+ {
465
+ "epoch": 2.5485037816507727,
466
+ "grad_norm": 3.77066707611084,
467
+ "learning_rate": 3.3444978990317278e-06,
468
+ "loss": 0.5404,
469
+ "step": 31000
470
+ },
471
+ {
472
+ "epoch": 2.5896086813548176,
473
+ "grad_norm": 2.103886127471924,
474
+ "learning_rate": 3.0400097436209734e-06,
475
+ "loss": 0.5511,
476
+ "step": 31500
477
+ },
478
+ {
479
+ "epoch": 2.6307135810588624,
480
+ "grad_norm": 2.784128427505493,
481
+ "learning_rate": 2.7355215882102186e-06,
482
+ "loss": 0.5466,
483
+ "step": 32000
484
+ },
485
+ {
486
+ "epoch": 2.671818480762907,
487
+ "grad_norm": 2.9157445430755615,
488
+ "learning_rate": 2.4310334327994643e-06,
489
+ "loss": 0.5502,
490
+ "step": 32500
491
+ },
492
+ {
493
+ "epoch": 2.7129233804669517,
494
+ "grad_norm": 3.7201719284057617,
495
+ "learning_rate": 2.1265452773887095e-06,
496
+ "loss": 0.5435,
497
+ "step": 33000
498
+ },
499
+ {
500
+ "epoch": 2.7540282801709965,
501
+ "grad_norm": 3.0712995529174805,
502
+ "learning_rate": 1.8220571219779551e-06,
503
+ "loss": 0.5442,
504
+ "step": 33500
505
+ },
506
+ {
507
+ "epoch": 2.795133179875041,
508
+ "grad_norm": 3.5120229721069336,
509
+ "learning_rate": 1.5175689665672005e-06,
510
+ "loss": 0.5392,
511
+ "step": 34000
512
+ },
513
+ {
514
+ "epoch": 2.836238079579086,
515
+ "grad_norm": 3.09930682182312,
516
+ "learning_rate": 1.2130808111564462e-06,
517
+ "loss": 0.5355,
518
+ "step": 34500
519
+ },
520
+ {
521
+ "epoch": 2.8773429792831307,
522
+ "grad_norm": 2.3439698219299316,
523
+ "learning_rate": 9.085926557456916e-07,
524
+ "loss": 0.5531,
525
+ "step": 35000
526
+ },
527
+ {
528
+ "epoch": 2.918447878987175,
529
+ "grad_norm": 2.935701608657837,
530
+ "learning_rate": 6.04104500334937e-07,
531
+ "loss": 0.5355,
532
+ "step": 35500
533
+ },
534
+ {
535
+ "epoch": 2.95955277869122,
536
+ "grad_norm": 3.3096513748168945,
537
+ "learning_rate": 2.9961634492418246e-07,
538
+ "loss": 0.5528,
539
+ "step": 36000
540
+ },
541
+ {
542
+ "epoch": 3.0,
543
+ "eval_gen_len": 33.8306,
544
+ "eval_loss": 0.5498164296150208,
545
+ "eval_rouge1": 32.6478,
546
+ "eval_rouge2": 28.3022,
547
+ "eval_rougeL": 32.1001,
548
+ "eval_rougeLsum": 32.1838,
549
+ "eval_runtime": 1535.5456,
550
+ "eval_samples_per_second": 14.082,
551
+ "eval_steps_per_second": 0.44,
552
+ "step": 36492
553
+ }
554
+ ],
555
+ "logging_steps": 500,
556
+ "max_steps": 36492,
557
+ "num_input_tokens_seen": 0,
558
+ "num_train_epochs": 3,
559
+ "save_steps": 500,
560
+ "stateful_callbacks": {
561
+ "EarlyStoppingCallback": {
562
+ "args": {
563
+ "early_stopping_patience": 5,
564
+ "early_stopping_threshold": 0.01
565
+ },
566
+ "attributes": {
567
+ "early_stopping_patience_counter": 0
568
+ }
569
+ },
570
+ "TrainerControl": {
571
+ "args": {
572
+ "should_epoch_stop": false,
573
+ "should_evaluate": false,
574
+ "should_log": false,
575
+ "should_save": true,
576
+ "should_training_stop": true
577
+ },
578
+ "attributes": {}
579
+ }
580
+ },
581
+ "total_flos": 1.2577075189776384e+16,
582
+ "train_batch_size": 8,
583
+ "trial_name": null,
584
+ "trial_params": null
585
+ }
checkpoint-36492/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b6cdaabc5186df4c830688bac90d2f77a2412e2b54715163a34f977335aa8ac
3
+ size 5496
checkpoint-36492/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Helsinki-NLP/opus-mt-en-ru",
3
+ "_num_labels": 3,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "swish",
6
+ "add_bias_logits": false,
7
+ "add_final_layer_norm": false,
8
+ "architectures": [
9
+ "MarianMTModel"
10
+ ],
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 0,
13
+ "classif_dropout": 0.0,
14
+ "classifier_dropout": 0.0,
15
+ "d_model": 512,
16
+ "decoder_attention_heads": 8,
17
+ "decoder_ffn_dim": 2048,
18
+ "decoder_layerdrop": 0.0,
19
+ "decoder_layers": 6,
20
+ "decoder_start_token_id": 62517,
21
+ "decoder_vocab_size": 62518,
22
+ "dropout": 0.1,
23
+ "encoder_attention_heads": 8,
24
+ "encoder_ffn_dim": 2048,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 6,
27
+ "eos_token_id": 0,
28
+ "forced_eos_token_id": 0,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1",
32
+ "2": "LABEL_2"
33
+ },
34
+ "init_std": 0.02,
35
+ "is_encoder_decoder": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_length": null,
42
+ "max_position_embeddings": 512,
43
+ "model_type": "marian",
44
+ "normalize_before": false,
45
+ "normalize_embedding": false,
46
+ "num_beams": null,
47
+ "num_hidden_layers": 6,
48
+ "pad_token_id": 62517,
49
+ "scale_embedding": true,
50
+ "share_encoder_decoder_embeddings": true,
51
+ "static_position_embeddings": true,
52
+ "torch_dtype": "float32",
53
+ "transformers_version": "4.48.0",
54
+ "use_cache": true,
55
+ "vocab_size": 62518
56
+ }
generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bad_words_ids": [
3
+ [
4
+ 62517
5
+ ]
6
+ ],
7
+ "bos_token_id": 0,
8
+ "decoder_start_token_id": 62517,
9
+ "eos_token_id": 0,
10
+ "forced_eos_token_id": 0,
11
+ "max_length": 512,
12
+ "num_beams": 4,
13
+ "pad_token_id": 62517,
14
+ "renormalize_logits": true,
15
+ "transformers_version": "4.48.0"
16
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:299782ae957802fbb6326a979be87baa6e1af0752fc628ad1b8603565ba98c82
3
+ size 304869976
source.spm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16bebef1389a0b8ab452772c4e35b9e605e5713f8ac7baa71ca701394eaa086d
3
+ size 802781
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
target.spm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:745998e51ba5b058e38b7ac7765c25c43ed5c1c39cc92b27163b9b2e323c9d7c
3
+ size 1080169
tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "</s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "62517": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": false,
29
+ "eos_token": "</s>",
30
+ "extra_special_tokens": {},
31
+ "model_max_length": 512,
32
+ "pad_token": "<pad>",
33
+ "separate_vocabs": false,
34
+ "source_lang": "en",
35
+ "sp_model_kwargs": {},
36
+ "target_lang": "ru",
37
+ "tokenizer_class": "MarianTokenizer",
38
+ "unk_token": "<unk>"
39
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b6cdaabc5186df4c830688bac90d2f77a2412e2b54715163a34f977335aa8ac
3
+ size 5496
training_params.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data_path": "ck3-localization/autotrain-data",
3
+ "model": "Helsinki-NLP/opus-mt-en-ru",
4
+ "username": "Aze4ka",
5
+ "seed": 42,
6
+ "train_split": "train",
7
+ "valid_split": "validation",
8
+ "project_name": "ck3-localization",
9
+ "push_to_hub": true,
10
+ "text_column": "autotrain_text",
11
+ "target_column": "autotrain_label",
12
+ "lr": 2e-05,
13
+ "epochs": 3,
14
+ "max_seq_length": 512,
15
+ "max_target_length": 128,
16
+ "batch_size": 8,
17
+ "warmup_ratio": 0.1,
18
+ "gradient_accumulation": 1,
19
+ "optimizer": "adamw_torch",
20
+ "scheduler": "linear",
21
+ "weight_decay": 0.0,
22
+ "max_grad_norm": 1.0,
23
+ "logging_steps": 500,
24
+ "eval_strategy": "epoch",
25
+ "auto_find_batch_size": false,
26
+ "mixed_precision": null,
27
+ "save_total_limit": 2,
28
+ "peft": false,
29
+ "quantization": "int8",
30
+ "lora_r": 16,
31
+ "lora_alpha": 32,
32
+ "lora_dropout": 0.05,
33
+ "target_modules": "all-linear",
34
+ "log": "wandb",
35
+ "early_stopping_patience": 5,
36
+ "early_stopping_threshold": 0.01
37
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff