ajkndfjsdfasdf commited on
Commit
ff2f4fe
·
verified ·
1 Parent(s): 764f5bd

🚀 Full upload with token redacted

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ mt5_training_data-1.jsonl filter=lfs diff=lfs merge=lfs -text
mt5-finetuned/checkpoint-3396/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MT5ForConditionalGeneration"
4
+ ],
5
+ "classifier_dropout": 0.0,
6
+ "d_ff": 1024,
7
+ "d_kv": 64,
8
+ "d_model": 512,
9
+ "decoder_start_token_id": 0,
10
+ "dense_act_fn": "gelu_new",
11
+ "dropout_rate": 0.1,
12
+ "eos_token_id": 1,
13
+ "feed_forward_proj": "gated-gelu",
14
+ "initializer_factor": 1.0,
15
+ "is_encoder_decoder": true,
16
+ "is_gated_act": true,
17
+ "layer_norm_epsilon": 1e-06,
18
+ "model_type": "mt5",
19
+ "num_decoder_layers": 8,
20
+ "num_heads": 6,
21
+ "num_layers": 8,
22
+ "pad_token_id": 0,
23
+ "relative_attention_max_distance": 128,
24
+ "relative_attention_num_buckets": 32,
25
+ "tie_word_embeddings": false,
26
+ "tokenizer_class": "T5Tokenizer",
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.50.3",
29
+ "use_cache": true,
30
+ "vocab_size": 250112
31
+ }
mt5-finetuned/checkpoint-3396/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.50.3"
7
+ }
mt5-finetuned/checkpoint-3396/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12c79e43eb66e66bcb13da69d4451680d8712e6b0628c70711a82af7e6c57ed8
3
+ size 1200729512
mt5-finetuned/checkpoint-3396/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6101cc68079cd153e117fe790fe37419ddba2b8d0e219425e1f58144677408cd
3
+ size 2401574330
mt5-finetuned/checkpoint-3396/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbc85351130da129c85f266e528de4e2ebc79b887b0384adec3bb21d7a490d09
3
+ size 14244
mt5-finetuned/checkpoint-3396/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1022e6be58b62a94302bc1721732cafa60a463bf5853b07dee00ba361293d57
3
+ size 1064
mt5-finetuned/checkpoint-3396/trainer_state.json ADDED
@@ -0,0 +1,536 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 100,
7
+ "global_step": 3396,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0005889281507656066,
14
+ "grad_norm": 1069.6029052734375,
15
+ "learning_rate": 1.6339869281045752e-07,
16
+ "loss": 61.3399,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.05889281507656066,
21
+ "grad_norm": 295.9472961425781,
22
+ "learning_rate": 1.6339869281045753e-05,
23
+ "loss": 55.7859,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.05889281507656066,
28
+ "eval_loss": 34.09705352783203,
29
+ "eval_runtime": 0.5351,
30
+ "eval_samples_per_second": 934.481,
31
+ "eval_steps_per_second": 3.738,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 0.11778563015312132,
36
+ "grad_norm": 397.3686218261719,
37
+ "learning_rate": 3.2679738562091506e-05,
38
+ "loss": 36.4484,
39
+ "step": 200
40
+ },
41
+ {
42
+ "epoch": 0.11778563015312132,
43
+ "eval_loss": 18.607961654663086,
44
+ "eval_runtime": 0.5345,
45
+ "eval_samples_per_second": 935.528,
46
+ "eval_steps_per_second": 3.742,
47
+ "step": 200
48
+ },
49
+ {
50
+ "epoch": 0.17667844522968199,
51
+ "grad_norm": 91.78496551513672,
52
+ "learning_rate": 4.901960784313725e-05,
53
+ "loss": 19.2835,
54
+ "step": 300
55
+ },
56
+ {
57
+ "epoch": 0.17667844522968199,
58
+ "eval_loss": 8.0890474319458,
59
+ "eval_runtime": 0.542,
60
+ "eval_samples_per_second": 922.533,
61
+ "eval_steps_per_second": 3.69,
62
+ "step": 300
63
+ },
64
+ {
65
+ "epoch": 0.23557126030624265,
66
+ "grad_norm": 25.501201629638672,
67
+ "learning_rate": 4.901837928153718e-05,
68
+ "loss": 7.4464,
69
+ "step": 400
70
+ },
71
+ {
72
+ "epoch": 0.23557126030624265,
73
+ "eval_loss": 3.304853677749634,
74
+ "eval_runtime": 0.5365,
75
+ "eval_samples_per_second": 931.913,
76
+ "eval_steps_per_second": 3.728,
77
+ "step": 400
78
+ },
79
+ {
80
+ "epoch": 0.2944640753828033,
81
+ "grad_norm": 23.07789421081543,
82
+ "learning_rate": 4.7974101921470346e-05,
83
+ "loss": 2.2278,
84
+ "step": 500
85
+ },
86
+ {
87
+ "epoch": 0.2944640753828033,
88
+ "eval_loss": 0.6309428811073303,
89
+ "eval_runtime": 0.534,
90
+ "eval_samples_per_second": 936.293,
91
+ "eval_steps_per_second": 3.745,
92
+ "step": 500
93
+ },
94
+ {
95
+ "epoch": 0.35335689045936397,
96
+ "grad_norm": 6.330116271972656,
97
+ "learning_rate": 4.6929824561403515e-05,
98
+ "loss": 0.7991,
99
+ "step": 600
100
+ },
101
+ {
102
+ "epoch": 0.35335689045936397,
103
+ "eval_loss": 0.39563897252082825,
104
+ "eval_runtime": 0.5351,
105
+ "eval_samples_per_second": 934.394,
106
+ "eval_steps_per_second": 3.738,
107
+ "step": 600
108
+ },
109
+ {
110
+ "epoch": 0.4122497055359246,
111
+ "grad_norm": 0.8220232129096985,
112
+ "learning_rate": 4.588554720133668e-05,
113
+ "loss": 0.4644,
114
+ "step": 700
115
+ },
116
+ {
117
+ "epoch": 0.4122497055359246,
118
+ "eval_loss": 0.22928042709827423,
119
+ "eval_runtime": 0.545,
120
+ "eval_samples_per_second": 917.424,
121
+ "eval_steps_per_second": 3.67,
122
+ "step": 700
123
+ },
124
+ {
125
+ "epoch": 0.4711425206124853,
126
+ "grad_norm": 0.47425171732902527,
127
+ "learning_rate": 4.4841269841269846e-05,
128
+ "loss": 0.3034,
129
+ "step": 800
130
+ },
131
+ {
132
+ "epoch": 0.4711425206124853,
133
+ "eval_loss": 0.15583929419517517,
134
+ "eval_runtime": 0.5482,
135
+ "eval_samples_per_second": 912.069,
136
+ "eval_steps_per_second": 3.648,
137
+ "step": 800
138
+ },
139
+ {
140
+ "epoch": 0.5300353356890459,
141
+ "grad_norm": 0.33976656198501587,
142
+ "learning_rate": 4.379699248120301e-05,
143
+ "loss": 0.2374,
144
+ "step": 900
145
+ },
146
+ {
147
+ "epoch": 0.5300353356890459,
148
+ "eval_loss": 0.13674509525299072,
149
+ "eval_runtime": 0.5433,
150
+ "eval_samples_per_second": 920.268,
151
+ "eval_steps_per_second": 3.681,
152
+ "step": 900
153
+ },
154
+ {
155
+ "epoch": 0.5889281507656066,
156
+ "grad_norm": 0.29963016510009766,
157
+ "learning_rate": 4.2752715121136177e-05,
158
+ "loss": 0.2106,
159
+ "step": 1000
160
+ },
161
+ {
162
+ "epoch": 0.5889281507656066,
163
+ "eval_loss": 0.12364959716796875,
164
+ "eval_runtime": 0.5453,
165
+ "eval_samples_per_second": 916.855,
166
+ "eval_steps_per_second": 3.667,
167
+ "step": 1000
168
+ },
169
+ {
170
+ "epoch": 0.6478209658421673,
171
+ "grad_norm": 0.21829353272914886,
172
+ "learning_rate": 4.170843776106934e-05,
173
+ "loss": 0.1869,
174
+ "step": 1100
175
+ },
176
+ {
177
+ "epoch": 0.6478209658421673,
178
+ "eval_loss": 0.11144684255123138,
179
+ "eval_runtime": 0.5369,
180
+ "eval_samples_per_second": 931.332,
181
+ "eval_steps_per_second": 3.725,
182
+ "step": 1100
183
+ },
184
+ {
185
+ "epoch": 0.7067137809187279,
186
+ "grad_norm": 0.3481610119342804,
187
+ "learning_rate": 4.066416040100251e-05,
188
+ "loss": 0.1718,
189
+ "step": 1200
190
+ },
191
+ {
192
+ "epoch": 0.7067137809187279,
193
+ "eval_loss": 0.10174528509378433,
194
+ "eval_runtime": 0.5358,
195
+ "eval_samples_per_second": 933.262,
196
+ "eval_steps_per_second": 3.733,
197
+ "step": 1200
198
+ },
199
+ {
200
+ "epoch": 0.7656065959952886,
201
+ "grad_norm": 0.20769686996936798,
202
+ "learning_rate": 3.9619883040935676e-05,
203
+ "loss": 0.1608,
204
+ "step": 1300
205
+ },
206
+ {
207
+ "epoch": 0.7656065959952886,
208
+ "eval_loss": 0.09667050093412399,
209
+ "eval_runtime": 0.5354,
210
+ "eval_samples_per_second": 933.831,
211
+ "eval_steps_per_second": 3.735,
212
+ "step": 1300
213
+ },
214
+ {
215
+ "epoch": 0.8244994110718492,
216
+ "grad_norm": 0.18638956546783447,
217
+ "learning_rate": 3.8575605680868845e-05,
218
+ "loss": 0.1527,
219
+ "step": 1400
220
+ },
221
+ {
222
+ "epoch": 0.8244994110718492,
223
+ "eval_loss": 0.09282659739255905,
224
+ "eval_runtime": 0.5367,
225
+ "eval_samples_per_second": 931.64,
226
+ "eval_steps_per_second": 3.727,
227
+ "step": 1400
228
+ },
229
+ {
230
+ "epoch": 0.8833922261484098,
231
+ "grad_norm": 0.1558378040790558,
232
+ "learning_rate": 3.753132832080201e-05,
233
+ "loss": 0.1483,
234
+ "step": 1500
235
+ },
236
+ {
237
+ "epoch": 0.8833922261484098,
238
+ "eval_loss": 0.0904233381152153,
239
+ "eval_runtime": 0.5394,
240
+ "eval_samples_per_second": 926.881,
241
+ "eval_steps_per_second": 3.708,
242
+ "step": 1500
243
+ },
244
+ {
245
+ "epoch": 0.9422850412249706,
246
+ "grad_norm": 0.15275876224040985,
247
+ "learning_rate": 3.6487050960735175e-05,
248
+ "loss": 0.1412,
249
+ "step": 1600
250
+ },
251
+ {
252
+ "epoch": 0.9422850412249706,
253
+ "eval_loss": 0.08748902380466461,
254
+ "eval_runtime": 0.5405,
255
+ "eval_samples_per_second": 925.022,
256
+ "eval_steps_per_second": 3.7,
257
+ "step": 1600
258
+ },
259
+ {
260
+ "epoch": 1.0011778563015312,
261
+ "grad_norm": 0.20647253096103668,
262
+ "learning_rate": 3.544277360066834e-05,
263
+ "loss": 0.1372,
264
+ "step": 1700
265
+ },
266
+ {
267
+ "epoch": 1.0011778563015312,
268
+ "eval_loss": 0.08554470539093018,
269
+ "eval_runtime": 0.5347,
270
+ "eval_samples_per_second": 935.021,
271
+ "eval_steps_per_second": 3.74,
272
+ "step": 1700
273
+ },
274
+ {
275
+ "epoch": 1.0600706713780919,
276
+ "grad_norm": 0.13815245032310486,
277
+ "learning_rate": 3.4398496240601506e-05,
278
+ "loss": 0.1325,
279
+ "step": 1800
280
+ },
281
+ {
282
+ "epoch": 1.0600706713780919,
283
+ "eval_loss": 0.0836854949593544,
284
+ "eval_runtime": 0.5352,
285
+ "eval_samples_per_second": 934.268,
286
+ "eval_steps_per_second": 3.737,
287
+ "step": 1800
288
+ },
289
+ {
290
+ "epoch": 1.1189634864546525,
291
+ "grad_norm": 0.14026539027690887,
292
+ "learning_rate": 3.335421888053467e-05,
293
+ "loss": 0.13,
294
+ "step": 1900
295
+ },
296
+ {
297
+ "epoch": 1.1189634864546525,
298
+ "eval_loss": 0.08203620463609695,
299
+ "eval_runtime": 0.5348,
300
+ "eval_samples_per_second": 934.996,
301
+ "eval_steps_per_second": 3.74,
302
+ "step": 1900
303
+ },
304
+ {
305
+ "epoch": 1.1778563015312131,
306
+ "grad_norm": 0.12261384725570679,
307
+ "learning_rate": 3.230994152046784e-05,
308
+ "loss": 0.1282,
309
+ "step": 2000
310
+ },
311
+ {
312
+ "epoch": 1.1778563015312131,
313
+ "eval_loss": 0.08088693022727966,
314
+ "eval_runtime": 0.539,
315
+ "eval_samples_per_second": 927.574,
316
+ "eval_steps_per_second": 3.71,
317
+ "step": 2000
318
+ },
319
+ {
320
+ "epoch": 1.2367491166077738,
321
+ "grad_norm": 0.13724654912948608,
322
+ "learning_rate": 3.1265664160401006e-05,
323
+ "loss": 0.1247,
324
+ "step": 2100
325
+ },
326
+ {
327
+ "epoch": 1.2367491166077738,
328
+ "eval_loss": 0.07952813804149628,
329
+ "eval_runtime": 0.5404,
330
+ "eval_samples_per_second": 925.283,
331
+ "eval_steps_per_second": 3.701,
332
+ "step": 2100
333
+ },
334
+ {
335
+ "epoch": 1.2956419316843344,
336
+ "grad_norm": 0.17809857428073883,
337
+ "learning_rate": 3.022138680033417e-05,
338
+ "loss": 0.1238,
339
+ "step": 2200
340
+ },
341
+ {
342
+ "epoch": 1.2956419316843344,
343
+ "eval_loss": 0.07879804819822311,
344
+ "eval_runtime": 0.5341,
345
+ "eval_samples_per_second": 936.116,
346
+ "eval_steps_per_second": 3.744,
347
+ "step": 2200
348
+ },
349
+ {
350
+ "epoch": 1.3545347467608952,
351
+ "grad_norm": 0.11621029675006866,
352
+ "learning_rate": 2.9177109440267336e-05,
353
+ "loss": 0.1209,
354
+ "step": 2300
355
+ },
356
+ {
357
+ "epoch": 1.3545347467608952,
358
+ "eval_loss": 0.07757514715194702,
359
+ "eval_runtime": 0.5351,
360
+ "eval_samples_per_second": 934.37,
361
+ "eval_steps_per_second": 3.737,
362
+ "step": 2300
363
+ },
364
+ {
365
+ "epoch": 1.4134275618374559,
366
+ "grad_norm": 0.12245041131973267,
367
+ "learning_rate": 2.8132832080200505e-05,
368
+ "loss": 0.1166,
369
+ "step": 2400
370
+ },
371
+ {
372
+ "epoch": 1.4134275618374559,
373
+ "eval_loss": 0.07674014568328857,
374
+ "eval_runtime": 0.5335,
375
+ "eval_samples_per_second": 937.176,
376
+ "eval_steps_per_second": 3.749,
377
+ "step": 2400
378
+ },
379
+ {
380
+ "epoch": 1.4723203769140165,
381
+ "grad_norm": 0.11454136669635773,
382
+ "learning_rate": 2.7088554720133667e-05,
383
+ "loss": 0.1189,
384
+ "step": 2500
385
+ },
386
+ {
387
+ "epoch": 1.4723203769140165,
388
+ "eval_loss": 0.0758647620677948,
389
+ "eval_runtime": 0.5451,
390
+ "eval_samples_per_second": 917.286,
391
+ "eval_steps_per_second": 3.669,
392
+ "step": 2500
393
+ },
394
+ {
395
+ "epoch": 1.5312131919905771,
396
+ "grad_norm": 0.10498815774917603,
397
+ "learning_rate": 2.604427736006684e-05,
398
+ "loss": 0.1154,
399
+ "step": 2600
400
+ },
401
+ {
402
+ "epoch": 1.5312131919905771,
403
+ "eval_loss": 0.07501054555177689,
404
+ "eval_runtime": 0.5354,
405
+ "eval_samples_per_second": 933.938,
406
+ "eval_steps_per_second": 3.736,
407
+ "step": 2600
408
+ },
409
+ {
410
+ "epoch": 1.5901060070671378,
411
+ "grad_norm": 0.1041310578584671,
412
+ "learning_rate": 2.5e-05,
413
+ "loss": 0.1145,
414
+ "step": 2700
415
+ },
416
+ {
417
+ "epoch": 1.5901060070671378,
418
+ "eval_loss": 0.07420270144939423,
419
+ "eval_runtime": 0.5438,
420
+ "eval_samples_per_second": 919.476,
421
+ "eval_steps_per_second": 3.678,
422
+ "step": 2700
423
+ },
424
+ {
425
+ "epoch": 1.6489988221436984,
426
+ "grad_norm": 0.14854931831359863,
427
+ "learning_rate": 2.3955722639933167e-05,
428
+ "loss": 0.1123,
429
+ "step": 2800
430
+ },
431
+ {
432
+ "epoch": 1.6489988221436984,
433
+ "eval_loss": 0.07348344475030899,
434
+ "eval_runtime": 0.5481,
435
+ "eval_samples_per_second": 912.284,
436
+ "eval_steps_per_second": 3.649,
437
+ "step": 2800
438
+ },
439
+ {
440
+ "epoch": 1.7078916372202593,
441
+ "grad_norm": 0.10635272413492203,
442
+ "learning_rate": 2.2911445279866335e-05,
443
+ "loss": 0.1127,
444
+ "step": 2900
445
+ },
446
+ {
447
+ "epoch": 1.7078916372202593,
448
+ "eval_loss": 0.07267069816589355,
449
+ "eval_runtime": 0.5354,
450
+ "eval_samples_per_second": 933.878,
451
+ "eval_steps_per_second": 3.736,
452
+ "step": 2900
453
+ },
454
+ {
455
+ "epoch": 1.76678445229682,
456
+ "grad_norm": 0.11862709373235703,
457
+ "learning_rate": 2.18671679197995e-05,
458
+ "loss": 0.1109,
459
+ "step": 3000
460
+ },
461
+ {
462
+ "epoch": 1.76678445229682,
463
+ "eval_loss": 0.07196911424398422,
464
+ "eval_runtime": 0.5364,
465
+ "eval_samples_per_second": 932.187,
466
+ "eval_steps_per_second": 3.729,
467
+ "step": 3000
468
+ },
469
+ {
470
+ "epoch": 1.8256772673733805,
471
+ "grad_norm": 0.11262491345405579,
472
+ "learning_rate": 2.0822890559732666e-05,
473
+ "loss": 0.11,
474
+ "step": 3100
475
+ },
476
+ {
477
+ "epoch": 1.8256772673733805,
478
+ "eval_loss": 0.0713561549782753,
479
+ "eval_runtime": 0.5407,
480
+ "eval_samples_per_second": 924.79,
481
+ "eval_steps_per_second": 3.699,
482
+ "step": 3100
483
+ },
484
+ {
485
+ "epoch": 1.8845700824499412,
486
+ "grad_norm": 0.17745310068130493,
487
+ "learning_rate": 1.977861319966583e-05,
488
+ "loss": 0.1084,
489
+ "step": 3200
490
+ },
491
+ {
492
+ "epoch": 1.8845700824499412,
493
+ "eval_loss": 0.07110374420881271,
494
+ "eval_runtime": 0.537,
495
+ "eval_samples_per_second": 931.079,
496
+ "eval_steps_per_second": 3.724,
497
+ "step": 3200
498
+ },
499
+ {
500
+ "epoch": 1.9434628975265018,
501
+ "grad_norm": 0.12771931290626526,
502
+ "learning_rate": 1.8734335839599e-05,
503
+ "loss": 0.1083,
504
+ "step": 3300
505
+ },
506
+ {
507
+ "epoch": 1.9434628975265018,
508
+ "eval_loss": 0.07047487050294876,
509
+ "eval_runtime": 0.5416,
510
+ "eval_samples_per_second": 923.161,
511
+ "eval_steps_per_second": 3.693,
512
+ "step": 3300
513
+ }
514
+ ],
515
+ "logging_steps": 100,
516
+ "max_steps": 5094,
517
+ "num_input_tokens_seen": 0,
518
+ "num_train_epochs": 3,
519
+ "save_steps": 500,
520
+ "stateful_callbacks": {
521
+ "TrainerControl": {
522
+ "args": {
523
+ "should_epoch_stop": false,
524
+ "should_evaluate": false,
525
+ "should_log": false,
526
+ "should_save": true,
527
+ "should_training_stop": false
528
+ },
529
+ "attributes": {}
530
+ }
531
+ },
532
+ "total_flos": 2.2439819631132672e+17,
533
+ "train_batch_size": 250,
534
+ "trial_name": null,
535
+ "trial_params": null
536
+ }
mt5-finetuned/checkpoint-3396/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:338671f41322537cc4ced53c635facb13cc93423bbd52c98d16f5afbe30f2376
3
+ size 5304
mt5-finetuned/checkpoint-5094/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MT5ForConditionalGeneration"
4
+ ],
5
+ "classifier_dropout": 0.0,
6
+ "d_ff": 1024,
7
+ "d_kv": 64,
8
+ "d_model": 512,
9
+ "decoder_start_token_id": 0,
10
+ "dense_act_fn": "gelu_new",
11
+ "dropout_rate": 0.1,
12
+ "eos_token_id": 1,
13
+ "feed_forward_proj": "gated-gelu",
14
+ "initializer_factor": 1.0,
15
+ "is_encoder_decoder": true,
16
+ "is_gated_act": true,
17
+ "layer_norm_epsilon": 1e-06,
18
+ "model_type": "mt5",
19
+ "num_decoder_layers": 8,
20
+ "num_heads": 6,
21
+ "num_layers": 8,
22
+ "pad_token_id": 0,
23
+ "relative_attention_max_distance": 128,
24
+ "relative_attention_num_buckets": 32,
25
+ "tie_word_embeddings": false,
26
+ "tokenizer_class": "T5Tokenizer",
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.50.3",
29
+ "use_cache": true,
30
+ "vocab_size": 250112
31
+ }
mt5-finetuned/checkpoint-5094/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.50.3"
7
+ }
mt5-finetuned/checkpoint-5094/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0529a72bcfa99e852266c278f9fe2be272ea49629b4ec8734831136e6bc4645
3
+ size 1200729512
mt5-finetuned/checkpoint-5094/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c5b8bdc4d47ec94aa1c9b9cb5e3c924ce4111bd634796a8aed15fcf87899535
3
+ size 2401574330
mt5-finetuned/checkpoint-5094/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2c5cb50e12211e8ff6f13dcf72c4e0877e4918fa1a1ef677fddd3262f3553d4
3
+ size 14244
mt5-finetuned/checkpoint-5094/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a510cef9a5cd55b582c9172902c5ea4d5b7dba6acfef03beb1323ced783e71a
3
+ size 1064
mt5-finetuned/checkpoint-5094/trainer_state.json ADDED
@@ -0,0 +1,791 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 100,
7
+ "global_step": 5094,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0005889281507656066,
14
+ "grad_norm": 1069.6029052734375,
15
+ "learning_rate": 1.6339869281045752e-07,
16
+ "loss": 61.3399,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.05889281507656066,
21
+ "grad_norm": 295.9472961425781,
22
+ "learning_rate": 1.6339869281045753e-05,
23
+ "loss": 55.7859,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.05889281507656066,
28
+ "eval_loss": 34.09705352783203,
29
+ "eval_runtime": 0.5351,
30
+ "eval_samples_per_second": 934.481,
31
+ "eval_steps_per_second": 3.738,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 0.11778563015312132,
36
+ "grad_norm": 397.3686218261719,
37
+ "learning_rate": 3.2679738562091506e-05,
38
+ "loss": 36.4484,
39
+ "step": 200
40
+ },
41
+ {
42
+ "epoch": 0.11778563015312132,
43
+ "eval_loss": 18.607961654663086,
44
+ "eval_runtime": 0.5345,
45
+ "eval_samples_per_second": 935.528,
46
+ "eval_steps_per_second": 3.742,
47
+ "step": 200
48
+ },
49
+ {
50
+ "epoch": 0.17667844522968199,
51
+ "grad_norm": 91.78496551513672,
52
+ "learning_rate": 4.901960784313725e-05,
53
+ "loss": 19.2835,
54
+ "step": 300
55
+ },
56
+ {
57
+ "epoch": 0.17667844522968199,
58
+ "eval_loss": 8.0890474319458,
59
+ "eval_runtime": 0.542,
60
+ "eval_samples_per_second": 922.533,
61
+ "eval_steps_per_second": 3.69,
62
+ "step": 300
63
+ },
64
+ {
65
+ "epoch": 0.23557126030624265,
66
+ "grad_norm": 25.501201629638672,
67
+ "learning_rate": 4.901837928153718e-05,
68
+ "loss": 7.4464,
69
+ "step": 400
70
+ },
71
+ {
72
+ "epoch": 0.23557126030624265,
73
+ "eval_loss": 3.304853677749634,
74
+ "eval_runtime": 0.5365,
75
+ "eval_samples_per_second": 931.913,
76
+ "eval_steps_per_second": 3.728,
77
+ "step": 400
78
+ },
79
+ {
80
+ "epoch": 0.2944640753828033,
81
+ "grad_norm": 23.07789421081543,
82
+ "learning_rate": 4.7974101921470346e-05,
83
+ "loss": 2.2278,
84
+ "step": 500
85
+ },
86
+ {
87
+ "epoch": 0.2944640753828033,
88
+ "eval_loss": 0.6309428811073303,
89
+ "eval_runtime": 0.534,
90
+ "eval_samples_per_second": 936.293,
91
+ "eval_steps_per_second": 3.745,
92
+ "step": 500
93
+ },
94
+ {
95
+ "epoch": 0.35335689045936397,
96
+ "grad_norm": 6.330116271972656,
97
+ "learning_rate": 4.6929824561403515e-05,
98
+ "loss": 0.7991,
99
+ "step": 600
100
+ },
101
+ {
102
+ "epoch": 0.35335689045936397,
103
+ "eval_loss": 0.39563897252082825,
104
+ "eval_runtime": 0.5351,
105
+ "eval_samples_per_second": 934.394,
106
+ "eval_steps_per_second": 3.738,
107
+ "step": 600
108
+ },
109
+ {
110
+ "epoch": 0.4122497055359246,
111
+ "grad_norm": 0.8220232129096985,
112
+ "learning_rate": 4.588554720133668e-05,
113
+ "loss": 0.4644,
114
+ "step": 700
115
+ },
116
+ {
117
+ "epoch": 0.4122497055359246,
118
+ "eval_loss": 0.22928042709827423,
119
+ "eval_runtime": 0.545,
120
+ "eval_samples_per_second": 917.424,
121
+ "eval_steps_per_second": 3.67,
122
+ "step": 700
123
+ },
124
+ {
125
+ "epoch": 0.4711425206124853,
126
+ "grad_norm": 0.47425171732902527,
127
+ "learning_rate": 4.4841269841269846e-05,
128
+ "loss": 0.3034,
129
+ "step": 800
130
+ },
131
+ {
132
+ "epoch": 0.4711425206124853,
133
+ "eval_loss": 0.15583929419517517,
134
+ "eval_runtime": 0.5482,
135
+ "eval_samples_per_second": 912.069,
136
+ "eval_steps_per_second": 3.648,
137
+ "step": 800
138
+ },
139
+ {
140
+ "epoch": 0.5300353356890459,
141
+ "grad_norm": 0.33976656198501587,
142
+ "learning_rate": 4.379699248120301e-05,
143
+ "loss": 0.2374,
144
+ "step": 900
145
+ },
146
+ {
147
+ "epoch": 0.5300353356890459,
148
+ "eval_loss": 0.13674509525299072,
149
+ "eval_runtime": 0.5433,
150
+ "eval_samples_per_second": 920.268,
151
+ "eval_steps_per_second": 3.681,
152
+ "step": 900
153
+ },
154
+ {
155
+ "epoch": 0.5889281507656066,
156
+ "grad_norm": 0.29963016510009766,
157
+ "learning_rate": 4.2752715121136177e-05,
158
+ "loss": 0.2106,
159
+ "step": 1000
160
+ },
161
+ {
162
+ "epoch": 0.5889281507656066,
163
+ "eval_loss": 0.12364959716796875,
164
+ "eval_runtime": 0.5453,
165
+ "eval_samples_per_second": 916.855,
166
+ "eval_steps_per_second": 3.667,
167
+ "step": 1000
168
+ },
169
+ {
170
+ "epoch": 0.6478209658421673,
171
+ "grad_norm": 0.21829353272914886,
172
+ "learning_rate": 4.170843776106934e-05,
173
+ "loss": 0.1869,
174
+ "step": 1100
175
+ },
176
+ {
177
+ "epoch": 0.6478209658421673,
178
+ "eval_loss": 0.11144684255123138,
179
+ "eval_runtime": 0.5369,
180
+ "eval_samples_per_second": 931.332,
181
+ "eval_steps_per_second": 3.725,
182
+ "step": 1100
183
+ },
184
+ {
185
+ "epoch": 0.7067137809187279,
186
+ "grad_norm": 0.3481610119342804,
187
+ "learning_rate": 4.066416040100251e-05,
188
+ "loss": 0.1718,
189
+ "step": 1200
190
+ },
191
+ {
192
+ "epoch": 0.7067137809187279,
193
+ "eval_loss": 0.10174528509378433,
194
+ "eval_runtime": 0.5358,
195
+ "eval_samples_per_second": 933.262,
196
+ "eval_steps_per_second": 3.733,
197
+ "step": 1200
198
+ },
199
+ {
200
+ "epoch": 0.7656065959952886,
201
+ "grad_norm": 0.20769686996936798,
202
+ "learning_rate": 3.9619883040935676e-05,
203
+ "loss": 0.1608,
204
+ "step": 1300
205
+ },
206
+ {
207
+ "epoch": 0.7656065959952886,
208
+ "eval_loss": 0.09667050093412399,
209
+ "eval_runtime": 0.5354,
210
+ "eval_samples_per_second": 933.831,
211
+ "eval_steps_per_second": 3.735,
212
+ "step": 1300
213
+ },
214
+ {
215
+ "epoch": 0.8244994110718492,
216
+ "grad_norm": 0.18638956546783447,
217
+ "learning_rate": 3.8575605680868845e-05,
218
+ "loss": 0.1527,
219
+ "step": 1400
220
+ },
221
+ {
222
+ "epoch": 0.8244994110718492,
223
+ "eval_loss": 0.09282659739255905,
224
+ "eval_runtime": 0.5367,
225
+ "eval_samples_per_second": 931.64,
226
+ "eval_steps_per_second": 3.727,
227
+ "step": 1400
228
+ },
229
+ {
230
+ "epoch": 0.8833922261484098,
231
+ "grad_norm": 0.1558378040790558,
232
+ "learning_rate": 3.753132832080201e-05,
233
+ "loss": 0.1483,
234
+ "step": 1500
235
+ },
236
+ {
237
+ "epoch": 0.8833922261484098,
238
+ "eval_loss": 0.0904233381152153,
239
+ "eval_runtime": 0.5394,
240
+ "eval_samples_per_second": 926.881,
241
+ "eval_steps_per_second": 3.708,
242
+ "step": 1500
243
+ },
244
+ {
245
+ "epoch": 0.9422850412249706,
246
+ "grad_norm": 0.15275876224040985,
247
+ "learning_rate": 3.6487050960735175e-05,
248
+ "loss": 0.1412,
249
+ "step": 1600
250
+ },
251
+ {
252
+ "epoch": 0.9422850412249706,
253
+ "eval_loss": 0.08748902380466461,
254
+ "eval_runtime": 0.5405,
255
+ "eval_samples_per_second": 925.022,
256
+ "eval_steps_per_second": 3.7,
257
+ "step": 1600
258
+ },
259
+ {
260
+ "epoch": 1.0011778563015312,
261
+ "grad_norm": 0.20647253096103668,
262
+ "learning_rate": 3.544277360066834e-05,
263
+ "loss": 0.1372,
264
+ "step": 1700
265
+ },
266
+ {
267
+ "epoch": 1.0011778563015312,
268
+ "eval_loss": 0.08554470539093018,
269
+ "eval_runtime": 0.5347,
270
+ "eval_samples_per_second": 935.021,
271
+ "eval_steps_per_second": 3.74,
272
+ "step": 1700
273
+ },
274
+ {
275
+ "epoch": 1.0600706713780919,
276
+ "grad_norm": 0.13815245032310486,
277
+ "learning_rate": 3.4398496240601506e-05,
278
+ "loss": 0.1325,
279
+ "step": 1800
280
+ },
281
+ {
282
+ "epoch": 1.0600706713780919,
283
+ "eval_loss": 0.0836854949593544,
284
+ "eval_runtime": 0.5352,
285
+ "eval_samples_per_second": 934.268,
286
+ "eval_steps_per_second": 3.737,
287
+ "step": 1800
288
+ },
289
+ {
290
+ "epoch": 1.1189634864546525,
291
+ "grad_norm": 0.14026539027690887,
292
+ "learning_rate": 3.335421888053467e-05,
293
+ "loss": 0.13,
294
+ "step": 1900
295
+ },
296
+ {
297
+ "epoch": 1.1189634864546525,
298
+ "eval_loss": 0.08203620463609695,
299
+ "eval_runtime": 0.5348,
300
+ "eval_samples_per_second": 934.996,
301
+ "eval_steps_per_second": 3.74,
302
+ "step": 1900
303
+ },
304
+ {
305
+ "epoch": 1.1778563015312131,
306
+ "grad_norm": 0.12261384725570679,
307
+ "learning_rate": 3.230994152046784e-05,
308
+ "loss": 0.1282,
309
+ "step": 2000
310
+ },
311
+ {
312
+ "epoch": 1.1778563015312131,
313
+ "eval_loss": 0.08088693022727966,
314
+ "eval_runtime": 0.539,
315
+ "eval_samples_per_second": 927.574,
316
+ "eval_steps_per_second": 3.71,
317
+ "step": 2000
318
+ },
319
+ {
320
+ "epoch": 1.2367491166077738,
321
+ "grad_norm": 0.13724654912948608,
322
+ "learning_rate": 3.1265664160401006e-05,
323
+ "loss": 0.1247,
324
+ "step": 2100
325
+ },
326
+ {
327
+ "epoch": 1.2367491166077738,
328
+ "eval_loss": 0.07952813804149628,
329
+ "eval_runtime": 0.5404,
330
+ "eval_samples_per_second": 925.283,
331
+ "eval_steps_per_second": 3.701,
332
+ "step": 2100
333
+ },
334
+ {
335
+ "epoch": 1.2956419316843344,
336
+ "grad_norm": 0.17809857428073883,
337
+ "learning_rate": 3.022138680033417e-05,
338
+ "loss": 0.1238,
339
+ "step": 2200
340
+ },
341
+ {
342
+ "epoch": 1.2956419316843344,
343
+ "eval_loss": 0.07879804819822311,
344
+ "eval_runtime": 0.5341,
345
+ "eval_samples_per_second": 936.116,
346
+ "eval_steps_per_second": 3.744,
347
+ "step": 2200
348
+ },
349
+ {
350
+ "epoch": 1.3545347467608952,
351
+ "grad_norm": 0.11621029675006866,
352
+ "learning_rate": 2.9177109440267336e-05,
353
+ "loss": 0.1209,
354
+ "step": 2300
355
+ },
356
+ {
357
+ "epoch": 1.3545347467608952,
358
+ "eval_loss": 0.07757514715194702,
359
+ "eval_runtime": 0.5351,
360
+ "eval_samples_per_second": 934.37,
361
+ "eval_steps_per_second": 3.737,
362
+ "step": 2300
363
+ },
364
+ {
365
+ "epoch": 1.4134275618374559,
366
+ "grad_norm": 0.12245041131973267,
367
+ "learning_rate": 2.8132832080200505e-05,
368
+ "loss": 0.1166,
369
+ "step": 2400
370
+ },
371
+ {
372
+ "epoch": 1.4134275618374559,
373
+ "eval_loss": 0.07674014568328857,
374
+ "eval_runtime": 0.5335,
375
+ "eval_samples_per_second": 937.176,
376
+ "eval_steps_per_second": 3.749,
377
+ "step": 2400
378
+ },
379
+ {
380
+ "epoch": 1.4723203769140165,
381
+ "grad_norm": 0.11454136669635773,
382
+ "learning_rate": 2.7088554720133667e-05,
383
+ "loss": 0.1189,
384
+ "step": 2500
385
+ },
386
+ {
387
+ "epoch": 1.4723203769140165,
388
+ "eval_loss": 0.0758647620677948,
389
+ "eval_runtime": 0.5451,
390
+ "eval_samples_per_second": 917.286,
391
+ "eval_steps_per_second": 3.669,
392
+ "step": 2500
393
+ },
394
+ {
395
+ "epoch": 1.5312131919905771,
396
+ "grad_norm": 0.10498815774917603,
397
+ "learning_rate": 2.604427736006684e-05,
398
+ "loss": 0.1154,
399
+ "step": 2600
400
+ },
401
+ {
402
+ "epoch": 1.5312131919905771,
403
+ "eval_loss": 0.07501054555177689,
404
+ "eval_runtime": 0.5354,
405
+ "eval_samples_per_second": 933.938,
406
+ "eval_steps_per_second": 3.736,
407
+ "step": 2600
408
+ },
409
+ {
410
+ "epoch": 1.5901060070671378,
411
+ "grad_norm": 0.1041310578584671,
412
+ "learning_rate": 2.5e-05,
413
+ "loss": 0.1145,
414
+ "step": 2700
415
+ },
416
+ {
417
+ "epoch": 1.5901060070671378,
418
+ "eval_loss": 0.07420270144939423,
419
+ "eval_runtime": 0.5438,
420
+ "eval_samples_per_second": 919.476,
421
+ "eval_steps_per_second": 3.678,
422
+ "step": 2700
423
+ },
424
+ {
425
+ "epoch": 1.6489988221436984,
426
+ "grad_norm": 0.14854931831359863,
427
+ "learning_rate": 2.3955722639933167e-05,
428
+ "loss": 0.1123,
429
+ "step": 2800
430
+ },
431
+ {
432
+ "epoch": 1.6489988221436984,
433
+ "eval_loss": 0.07348344475030899,
434
+ "eval_runtime": 0.5481,
435
+ "eval_samples_per_second": 912.284,
436
+ "eval_steps_per_second": 3.649,
437
+ "step": 2800
438
+ },
439
+ {
440
+ "epoch": 1.7078916372202593,
441
+ "grad_norm": 0.10635272413492203,
442
+ "learning_rate": 2.2911445279866335e-05,
443
+ "loss": 0.1127,
444
+ "step": 2900
445
+ },
446
+ {
447
+ "epoch": 1.7078916372202593,
448
+ "eval_loss": 0.07267069816589355,
449
+ "eval_runtime": 0.5354,
450
+ "eval_samples_per_second": 933.878,
451
+ "eval_steps_per_second": 3.736,
452
+ "step": 2900
453
+ },
454
+ {
455
+ "epoch": 1.76678445229682,
456
+ "grad_norm": 0.11862709373235703,
457
+ "learning_rate": 2.18671679197995e-05,
458
+ "loss": 0.1109,
459
+ "step": 3000
460
+ },
461
+ {
462
+ "epoch": 1.76678445229682,
463
+ "eval_loss": 0.07196911424398422,
464
+ "eval_runtime": 0.5364,
465
+ "eval_samples_per_second": 932.187,
466
+ "eval_steps_per_second": 3.729,
467
+ "step": 3000
468
+ },
469
+ {
470
+ "epoch": 1.8256772673733805,
471
+ "grad_norm": 0.11262491345405579,
472
+ "learning_rate": 2.0822890559732666e-05,
473
+ "loss": 0.11,
474
+ "step": 3100
475
+ },
476
+ {
477
+ "epoch": 1.8256772673733805,
478
+ "eval_loss": 0.0713561549782753,
479
+ "eval_runtime": 0.5407,
480
+ "eval_samples_per_second": 924.79,
481
+ "eval_steps_per_second": 3.699,
482
+ "step": 3100
483
+ },
484
+ {
485
+ "epoch": 1.8845700824499412,
486
+ "grad_norm": 0.17745310068130493,
487
+ "learning_rate": 1.977861319966583e-05,
488
+ "loss": 0.1084,
489
+ "step": 3200
490
+ },
491
+ {
492
+ "epoch": 1.8845700824499412,
493
+ "eval_loss": 0.07110374420881271,
494
+ "eval_runtime": 0.537,
495
+ "eval_samples_per_second": 931.079,
496
+ "eval_steps_per_second": 3.724,
497
+ "step": 3200
498
+ },
499
+ {
500
+ "epoch": 1.9434628975265018,
501
+ "grad_norm": 0.12771931290626526,
502
+ "learning_rate": 1.8734335839599e-05,
503
+ "loss": 0.1083,
504
+ "step": 3300
505
+ },
506
+ {
507
+ "epoch": 1.9434628975265018,
508
+ "eval_loss": 0.07047487050294876,
509
+ "eval_runtime": 0.5416,
510
+ "eval_samples_per_second": 923.161,
511
+ "eval_steps_per_second": 3.693,
512
+ "step": 3300
513
+ },
514
+ {
515
+ "epoch": 2.0023557126030624,
516
+ "grad_norm": 0.10948721319437027,
517
+ "learning_rate": 1.7690058479532165e-05,
518
+ "loss": 0.1087,
519
+ "step": 3400
520
+ },
521
+ {
522
+ "epoch": 2.0023557126030624,
523
+ "eval_loss": 0.06981312483549118,
524
+ "eval_runtime": 0.5441,
525
+ "eval_samples_per_second": 918.969,
526
+ "eval_steps_per_second": 3.676,
527
+ "step": 3400
528
+ },
529
+ {
530
+ "epoch": 2.061248527679623,
531
+ "grad_norm": 0.10693700611591339,
532
+ "learning_rate": 1.664578111946533e-05,
533
+ "loss": 0.1058,
534
+ "step": 3500
535
+ },
536
+ {
537
+ "epoch": 2.061248527679623,
538
+ "eval_loss": 0.06955926865339279,
539
+ "eval_runtime": 0.5359,
540
+ "eval_samples_per_second": 933.089,
541
+ "eval_steps_per_second": 3.732,
542
+ "step": 3500
543
+ },
544
+ {
545
+ "epoch": 2.1201413427561837,
546
+ "grad_norm": 0.13173066079616547,
547
+ "learning_rate": 1.5601503759398496e-05,
548
+ "loss": 0.1053,
549
+ "step": 3600
550
+ },
551
+ {
552
+ "epoch": 2.1201413427561837,
553
+ "eval_loss": 0.06913256645202637,
554
+ "eval_runtime": 0.5364,
555
+ "eval_samples_per_second": 932.186,
556
+ "eval_steps_per_second": 3.729,
557
+ "step": 3600
558
+ },
559
+ {
560
+ "epoch": 2.1790341578327443,
561
+ "grad_norm": 0.09946483373641968,
562
+ "learning_rate": 1.4557226399331663e-05,
563
+ "loss": 0.1051,
564
+ "step": 3700
565
+ },
566
+ {
567
+ "epoch": 2.1790341578327443,
568
+ "eval_loss": 0.06875628232955933,
569
+ "eval_runtime": 0.5347,
570
+ "eval_samples_per_second": 935.082,
571
+ "eval_steps_per_second": 3.74,
572
+ "step": 3700
573
+ },
574
+ {
575
+ "epoch": 2.237926972909305,
576
+ "grad_norm": 0.12029112130403519,
577
+ "learning_rate": 1.351294903926483e-05,
578
+ "loss": 0.1044,
579
+ "step": 3800
580
+ },
581
+ {
582
+ "epoch": 2.237926972909305,
583
+ "eval_loss": 0.06848787516355515,
584
+ "eval_runtime": 0.539,
585
+ "eval_samples_per_second": 927.692,
586
+ "eval_steps_per_second": 3.711,
587
+ "step": 3800
588
+ },
589
+ {
590
+ "epoch": 2.2968197879858656,
591
+ "grad_norm": 0.11469805240631104,
592
+ "learning_rate": 1.2468671679197996e-05,
593
+ "loss": 0.1027,
594
+ "step": 3900
595
+ },
596
+ {
597
+ "epoch": 2.2968197879858656,
598
+ "eval_loss": 0.06818344444036484,
599
+ "eval_runtime": 0.5338,
600
+ "eval_samples_per_second": 936.719,
601
+ "eval_steps_per_second": 3.747,
602
+ "step": 3900
603
+ },
604
+ {
605
+ "epoch": 2.3557126030624262,
606
+ "grad_norm": 0.09450303018093109,
607
+ "learning_rate": 1.1424394319131161e-05,
608
+ "loss": 0.1038,
609
+ "step": 4000
610
+ },
611
+ {
612
+ "epoch": 2.3557126030624262,
613
+ "eval_loss": 0.06780162453651428,
614
+ "eval_runtime": 0.5352,
615
+ "eval_samples_per_second": 934.248,
616
+ "eval_steps_per_second": 3.737,
617
+ "step": 4000
618
+ },
619
+ {
620
+ "epoch": 2.414605418138987,
621
+ "grad_norm": 0.09121797233819962,
622
+ "learning_rate": 1.0380116959064328e-05,
623
+ "loss": 0.1033,
624
+ "step": 4100
625
+ },
626
+ {
627
+ "epoch": 2.414605418138987,
628
+ "eval_loss": 0.06764357537031174,
629
+ "eval_runtime": 0.537,
630
+ "eval_samples_per_second": 931.05,
631
+ "eval_steps_per_second": 3.724,
632
+ "step": 4100
633
+ },
634
+ {
635
+ "epoch": 2.4734982332155475,
636
+ "grad_norm": 0.10826277732849121,
637
+ "learning_rate": 9.335839598997493e-06,
638
+ "loss": 0.103,
639
+ "step": 4200
640
+ },
641
+ {
642
+ "epoch": 2.4734982332155475,
643
+ "eval_loss": 0.06742237508296967,
644
+ "eval_runtime": 0.5439,
645
+ "eval_samples_per_second": 919.306,
646
+ "eval_steps_per_second": 3.677,
647
+ "step": 4200
648
+ },
649
+ {
650
+ "epoch": 2.5323910482921086,
651
+ "grad_norm": 0.10901422053575516,
652
+ "learning_rate": 8.29156223893066e-06,
653
+ "loss": 0.1008,
654
+ "step": 4300
655
+ },
656
+ {
657
+ "epoch": 2.5323910482921086,
658
+ "eval_loss": 0.06723643094301224,
659
+ "eval_runtime": 0.5356,
660
+ "eval_samples_per_second": 933.609,
661
+ "eval_steps_per_second": 3.734,
662
+ "step": 4300
663
+ },
664
+ {
665
+ "epoch": 2.591283863368669,
666
+ "grad_norm": 0.09686878323554993,
667
+ "learning_rate": 7.247284878863826e-06,
668
+ "loss": 0.1012,
669
+ "step": 4400
670
+ },
671
+ {
672
+ "epoch": 2.591283863368669,
673
+ "eval_loss": 0.06717756390571594,
674
+ "eval_runtime": 0.5349,
675
+ "eval_samples_per_second": 934.686,
676
+ "eval_steps_per_second": 3.739,
677
+ "step": 4400
678
+ },
679
+ {
680
+ "epoch": 2.65017667844523,
681
+ "grad_norm": 0.1727057844400406,
682
+ "learning_rate": 6.203007518796992e-06,
683
+ "loss": 0.1017,
684
+ "step": 4500
685
+ },
686
+ {
687
+ "epoch": 2.65017667844523,
688
+ "eval_loss": 0.06697698682546616,
689
+ "eval_runtime": 0.5491,
690
+ "eval_samples_per_second": 910.517,
691
+ "eval_steps_per_second": 3.642,
692
+ "step": 4500
693
+ },
694
+ {
695
+ "epoch": 2.7090694935217905,
696
+ "grad_norm": 0.1121884360909462,
697
+ "learning_rate": 5.158730158730159e-06,
698
+ "loss": 0.1004,
699
+ "step": 4600
700
+ },
701
+ {
702
+ "epoch": 2.7090694935217905,
703
+ "eval_loss": 0.06690937280654907,
704
+ "eval_runtime": 0.5371,
705
+ "eval_samples_per_second": 930.912,
706
+ "eval_steps_per_second": 3.724,
707
+ "step": 4600
708
+ },
709
+ {
710
+ "epoch": 2.767962308598351,
711
+ "grad_norm": 0.08828947693109512,
712
+ "learning_rate": 4.114452798663325e-06,
713
+ "loss": 0.0998,
714
+ "step": 4700
715
+ },
716
+ {
717
+ "epoch": 2.767962308598351,
718
+ "eval_loss": 0.06677506864070892,
719
+ "eval_runtime": 0.5482,
720
+ "eval_samples_per_second": 912.01,
721
+ "eval_steps_per_second": 3.648,
722
+ "step": 4700
723
+ },
724
+ {
725
+ "epoch": 2.8268551236749118,
726
+ "grad_norm": 0.09760947525501251,
727
+ "learning_rate": 3.070175438596491e-06,
728
+ "loss": 0.0989,
729
+ "step": 4800
730
+ },
731
+ {
732
+ "epoch": 2.8268551236749118,
733
+ "eval_loss": 0.0667320117354393,
734
+ "eval_runtime": 0.5435,
735
+ "eval_samples_per_second": 920.015,
736
+ "eval_steps_per_second": 3.68,
737
+ "step": 4800
738
+ },
739
+ {
740
+ "epoch": 2.8857479387514724,
741
+ "grad_norm": 0.0929727628827095,
742
+ "learning_rate": 2.0258980785296573e-06,
743
+ "loss": 0.1019,
744
+ "step": 4900
745
+ },
746
+ {
747
+ "epoch": 2.8857479387514724,
748
+ "eval_loss": 0.06665968149900436,
749
+ "eval_runtime": 0.536,
750
+ "eval_samples_per_second": 932.893,
751
+ "eval_steps_per_second": 3.732,
752
+ "step": 4900
753
+ },
754
+ {
755
+ "epoch": 2.944640753828033,
756
+ "grad_norm": 0.12202879041433334,
757
+ "learning_rate": 9.816207184628237e-07,
758
+ "loss": 0.098,
759
+ "step": 5000
760
+ },
761
+ {
762
+ "epoch": 2.944640753828033,
763
+ "eval_loss": 0.06658908724784851,
764
+ "eval_runtime": 0.5374,
765
+ "eval_samples_per_second": 930.358,
766
+ "eval_steps_per_second": 3.721,
767
+ "step": 5000
768
+ }
769
+ ],
770
+ "logging_steps": 100,
771
+ "max_steps": 5094,
772
+ "num_input_tokens_seen": 0,
773
+ "num_train_epochs": 3,
774
+ "save_steps": 500,
775
+ "stateful_callbacks": {
776
+ "TrainerControl": {
777
+ "args": {
778
+ "should_epoch_stop": false,
779
+ "should_evaluate": false,
780
+ "should_log": false,
781
+ "should_save": true,
782
+ "should_training_stop": true
783
+ },
784
+ "attributes": {}
785
+ }
786
+ },
787
+ "total_flos": 3.365972944669901e+17,
788
+ "train_batch_size": 250,
789
+ "trial_name": null,
790
+ "trial_params": null
791
+ }
mt5-finetuned/checkpoint-5094/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:338671f41322537cc4ced53c635facb13cc93423bbd52c98d16f5afbe30f2376
3
+ size 5304
mt5-finetuned/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MT5ForConditionalGeneration"
4
+ ],
5
+ "classifier_dropout": 0.0,
6
+ "d_ff": 1024,
7
+ "d_kv": 64,
8
+ "d_model": 512,
9
+ "decoder_start_token_id": 0,
10
+ "dense_act_fn": "gelu_new",
11
+ "dropout_rate": 0.1,
12
+ "eos_token_id": 1,
13
+ "feed_forward_proj": "gated-gelu",
14
+ "initializer_factor": 1.0,
15
+ "is_encoder_decoder": true,
16
+ "is_gated_act": true,
17
+ "layer_norm_epsilon": 1e-06,
18
+ "model_type": "mt5",
19
+ "num_decoder_layers": 8,
20
+ "num_heads": 6,
21
+ "num_layers": 8,
22
+ "pad_token_id": 0,
23
+ "relative_attention_max_distance": 128,
24
+ "relative_attention_num_buckets": 32,
25
+ "tie_word_embeddings": false,
26
+ "tokenizer_class": "T5Tokenizer",
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.50.3",
29
+ "use_cache": true,
30
+ "vocab_size": 250112
31
+ }
mt5-finetuned/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.50.3"
7
+ }
mt5-finetuned/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0529a72bcfa99e852266c278f9fe2be272ea49629b4ec8734831136e6bc4645
3
+ size 1200729512
mt5-finetuned/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "pad_token": {
10
+ "content": "<pad>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
mt5-finetuned/spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6
3
+ size 4309802
mt5-finetuned/tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<pad>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "</s>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<unk>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": [],
30
+ "clean_up_tokenization_spaces": false,
31
+ "eos_token": "</s>",
32
+ "extra_ids": 0,
33
+ "extra_special_tokens": {},
34
+ "legacy": true,
35
+ "model_max_length": 1000000000000000019884624838656,
36
+ "pad_token": "<pad>",
37
+ "sp_model_kwargs": {},
38
+ "tokenizer_class": "MT5Tokenizer",
39
+ "unk_token": "<unk>"
40
+ }
mt5_training_data-1.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efac62c55256372f42df4573729c7d4e0dce2c0046d7572b9712fcffc6c1e9aa
3
+ size 96595844
mt5_validation_data-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ sentencepiece
3
+ datasets
4
+ safetensors
5
+ torch
6
+ accelerate
7
+ scipy
8
+ wandb
save.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from huggingface_hub import upload_folder
import os
import tempfile
import shutil

# SECURITY: never hard-code an access token in source. The previous version
# embedded it literally, and the later redaction left behind the invalid
# literal ""***HIDDEN_TOKEN***"" (a syntax error). Read it from the
# environment instead.
hf_token = os.environ.get("HF_TOKEN", "")
repo_id = "ajkndfjsdfasdf/mt5-small-bigdataset"
folder_path = "."

# Entries that must never be uploaded.
ignored = {".git", "__pycache__", ".ipynb_checkpoints", "wandb", "logs", "temp_hf_repo"}

# Stage a clean copy of the folder in a temporary directory.
clean_dir = tempfile.mkdtemp()

# Copy everything except the ignored entries.
for item in os.listdir(folder_path):
    if item in ignored:
        continue
    src = os.path.join(folder_path, item)
    dst = os.path.join(clean_dir, item)

    if os.path.isdir(src):
        shutil.copytree(src, dst, dirs_exist_ok=True)
    elif os.path.isfile(src):
        # For .py files, mask the token if it appears in the code.
        # Replace with a *plain* placeholder: the original replaced it with a
        # quoted placeholder, which — when the token sat inside an
        # already-quoted literal — produced the doubled quotes ""***"" that
        # broke the uploaded script.
        if src.endswith(".py") and hf_token:
            with open(src, "r", encoding="utf-8") as f:
                content = f.read()
            content = content.replace(hf_token, "***HIDDEN_TOKEN***")
            with open(dst, "w", encoding="utf-8") as f:
                f.write(content)
        else:
            shutil.copy2(src, dst)

# 🚀 Upload the staged copy to the Hugging Face Hub.
upload_folder(
    repo_id=repo_id,
    folder_path=clean_dir,
    repo_type="model",
    token=hf_token or None,  # None lets huggingface_hub fall back to its own auth cache
    commit_message="🚀 Full upload with token redacted"
)

print(f"✅ Всё загружено на: https://huggingface.co/{repo_id}")
+ print(f"✅ Всё загружено на: https://huggingface.co/{repo_id}")
test.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from datasets import load_dataset
import torch

# Paths to the fine-tuned model and the validation data.
model_path = "./mt5-finetuned"
validation_file = "mt5_validation_data-1.jsonl"

# Load model and tokenizer; eval() disables dropout for deterministic inference.
tokenizer = MT5Tokenizer.from_pretrained(model_path)
model = MT5ForConditionalGeneration.from_pretrained(model_path)
model.eval()

# Use the GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Load the validation split (JSONL with "text"/"target" fields, matching train.py).
dataset = load_dataset("json", data_files={"validation": validation_file})
val_data = dataset["validation"]

def predict(text):
    """Return the model's beam-search prediction for a single input string."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256).to(device)
    # Inference only: no_grad avoids building autograd graphs, which the
    # original omitted — it wasted memory on every generate() call.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=64,
            num_beams=5,
            early_stopping=True
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Exact-match accuracy over the validation set.
correct = 0
results = []

for example in val_data:
    text = example["text"]
    target = example["target"].strip()
    pred = predict(text).strip()

    results.append((text, pred, target))
    if pred == target:
        correct += 1

# Show the first few predictions.
print("📋 Примеры предсказаний:\n")
for i, (text, pred, target) in enumerate(results[:80]):  # number of examples shown
    print(f"#{i+1}")
    print(f"📝 Вход: {text}")
    print(f"✅ Target: {target}")
    print(f"🤖 Предсказание: {pred}")
    print("-" * 50)

# Accuracy — guarded so an empty validation file reports 0.0 instead of
# raising ZeroDivisionError as the original did.
total = len(val_data)
accuracy = correct / total if total else 0.0
print(f"\n✅ Accuracy: {accuracy:.4f} ({correct}/{total})\n")
train.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset
import os
import wandb

# Setup crib (run once on a fresh machine):
#   cd workspace && pip install --no-cache-dir -r requirements.txt
#   apt-get update && apt-get install -y screen && apt install git-lfs -y
#   screen -S train
#   python train.py

# Load base model and tokenizer.
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")

# Load the train/validation JSONL datasets.
data_files = {
    "train": "mt5_training_data-1.jsonl",
    "validation": "mt5_validation_data-1.jsonl"
}
dataset = load_dataset("json", data_files=data_files)

def tokenize_function(examples):
    """Tokenize source ("text") and target ("target") columns for seq2seq training."""
    model_inputs = tokenizer(examples["text"], max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(examples["target"], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# SECURITY: the original committed a live W&B API key to the repository.
# Take the key from the environment; wandb.login(key=None) also falls back
# to WANDB_API_KEY / ~/.netrc on its own. Rotate the leaked key.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

training_args = TrainingArguments(
    output_dir="./mt5-finetuned",
    # NOTE(review): `evaluation_strategy` is the deprecated spelling of
    # `eval_strategy` in newer transformers releases — kept as-is because the
    # existing checkpoints prove this script ran with it; confirm before
    # upgrading transformers.
    evaluation_strategy="steps",
    eval_steps=100,
    learning_rate=5e-5,
    per_device_train_batch_size=250,
    per_device_eval_batch_size=250,
    num_train_epochs=3,
    logging_steps=100,
    warmup_ratio=0.06,
    logging_first_step=True,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
    save_strategy="epoch",
    report_to="wandb",
    run_name="mt5-finetuning-run",
    disable_tqdm=False,
    max_grad_norm=1.0
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"]
)

# Train, resuming from the most recent checkpoint in output_dir.
# (Use trainer.train() for a fresh run instead.)
trainer.train(resume_from_checkpoint=True)

# Save the final model and tokenizer locally.
model.save_pretrained("./mt5-finetuned")
tokenizer.save_pretrained("./mt5-finetuned")
print("✅ Модель сохранена локально в ./mt5-finetuned")