rbawden commited on
Commit
badaded
·
2 Parent(s): 36c761d f36da82

Merge branch 'main' of https://huggingface.co/rbawden/modern_french_normalisation into main

Browse files
Files changed (3) hide show
  1. config.json +6 -3
  2. modern_french_normalisation.py +0 -879
  3. pytorch_model.bin +2 -2
config.json CHANGED
@@ -9,7 +9,7 @@
9
  "attention_dropout": 0.0,
10
  "bos_token_id": 0,
11
  "custom_pipelines": {
12
- "modern_french_normalisation": {
13
  "default": {
14
  "model": {
15
  "pt": [
@@ -22,7 +22,8 @@
22
  "pt": [
23
  "AutoModelForSeq2SeqLM"
24
  ],
25
- "tf": []
 
26
  }
27
  },
28
  "d_model": 256,
@@ -31,6 +32,7 @@
31
  "add_cross_attention": false,
32
  "architectures": null,
33
  "bad_words_ids": null,
 
34
  "bos_token_id": 2,
35
  "chunk_size_feed_forward": 0,
36
  "cross_attention_hidden_size": null,
@@ -74,6 +76,7 @@
74
  "return_dict": true,
75
  "return_dict_in_generate": false,
76
  "sep_token_id": null,
 
77
  "task_specific_params": null,
78
  "temperature": 1.0,
79
  "tf_legacy_loss": false,
@@ -84,7 +87,7 @@
84
  "top_p": 1.0,
85
  "torch_dtype": null,
86
  "torchscript": false,
87
- "transformers_version": "4.21.2",
88
  "typical_p": 1.0,
89
  "use_bfloat16": false,
90
  "vocab_size": 1000
 
9
  "attention_dropout": 0.0,
10
  "bos_token_id": 0,
11
  "custom_pipelines": {
12
+ "modern-french-normalisation": {
13
  "default": {
14
  "model": {
15
  "pt": [
 
22
  "pt": [
23
  "AutoModelForSeq2SeqLM"
24
  ],
25
+ "tf": [],
26
+ "type": "text"
27
  }
28
  },
29
  "d_model": 256,
 
32
  "add_cross_attention": false,
33
  "architectures": null,
34
  "bad_words_ids": null,
35
+ "begin_suppress_tokens": null,
36
  "bos_token_id": 2,
37
  "chunk_size_feed_forward": 0,
38
  "cross_attention_hidden_size": null,
 
76
  "return_dict": true,
77
  "return_dict_in_generate": false,
78
  "sep_token_id": null,
79
+ "suppress_tokens": null,
80
  "task_specific_params": null,
81
  "temperature": 1.0,
82
  "tf_legacy_loss": false,
 
87
  "top_p": 1.0,
88
  "torch_dtype": null,
89
  "torchscript": false,
90
+ "transformers_version": "4.25.1",
91
  "typical_p": 1.0,
92
  "use_bfloat16": false,
93
  "vocab_size": 1000
modern_french_normalisation.py DELETED
@@ -1,879 +0,0 @@
1
- #!/usr/bin/python
2
- from transformers import Pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
3
- from transformers.tokenization_utils_base import TruncationStrategy
4
- from torch import Tensor
5
- import html.parser
6
- import unicodedata
7
- import sys, os
8
- import re
9
- import pickle
10
- from tqdm.auto import tqdm
11
- import operator
12
- from datasets import load_dataset
13
-
14
-
15
def basic_tokenise(string):
    """Separate punctuation from surrounding text by inserting spaces.

    Marks in ,.;?!:)("…- get a space inserted before them (a run of the
    same mark collapses to a single one); quote-like marks (' " ’) get a
    space inserted after them. The result is stripped of outer whitespace.
    """
    # Insert a space before each mark not already preceded by one; the '+'
    # collapses a run of the same mark into a single occurrence.
    for mark in r',.;?!:)("…-':
        string = re.sub('(?<! )' + re.escape(mark) + '+', ' ' + mark, string)
    # Insert a space after quote-like characters not already followed by one.
    for mark in '\'"’':
        string = re.sub(mark + '(?! )', mark + ' ', string)
    return string.strip()
22
-
23
def basic_tokenise_bs(string):
    """Separate punctuation with spaces on both sides (one pass per side)."""
    # One character class shared by both passes: space before, then space after.
    punct = '([,\.;\?!:\)\("…\'‘’”“«»\-])'
    string = re.sub('(?<! )' + punct, r' \1', string)
    string = re.sub(punct + '(?! )', r'\1 ', string)
    return string.strip()
28
-
29
def homogenise(sent, allow_alter_length=False):
    '''
    Homogenise an input sentence by lowercasing, removing diacritics, etc.
    If allow_alter_length is False, then only applies changes that do not alter
    the length of the original sentence (i.e. one-to-one modifications). If True,
    then also apply n-m replacements.
    '''
    sent = sent.lower()
    # n-m replacements (may change the sentence length)
    if allow_alter_length:
        for before, after in [('ã', 'an'), ('xoe', 'œ')]:
            sent = sent.replace(before, after)
        sent = sent.strip('-')
    # 1-1 replacements only (must not change the number of characters)
    replace_from = "ǽǣáàâäąãăåćčçďéèêëęěğìíîĩĭıïĺľłńñňòóôõöøŕřśšşťţùúûũüǔỳýŷÿźẑżžÁÀÂÄĄÃĂÅĆČÇĎÉÈÊËĘĚĞÌÍÎĨĬİÏĹĽŁŃÑŇÒÓÔÕÖØŔŘŚŠŞŤŢÙÚÛŨÜǓỲÝŶŸŹẐŻŽſ"
    replace_into = "ææaaaaaaaacccdeeeeeegiiiiiiilllnnnoooooorrsssttuuuuuuyyyyzzzzAAAAAAAACCCDEEEEEEGIIIIIIILLLNNNOOOOOORRSSSTTUUUUUUYYYYZZZZs"
    table = sent.maketrans(replace_from, replace_into)
    return sent.translate(table)
47
-
48
- ######## Edit distance functions #######
49
- def _wedit_dist_init(len1, len2):
50
- lev = []
51
- for i in range(len1):
52
- lev.append([0] * len2) # initialize 2D array to zero
53
- for i in range(len1):
54
- lev[i][0] = i # column 0: 0,1,2,3,4,...
55
- for j in range(len2):
56
- lev[0][j] = j # row 0: 0,1,2,3,4,...
57
- return lev
58
-
59
-
60
def _wedit_dist_step(
    lev, i, j, s1, s2, last_left, last_right, transpositions=False
):
    """Fill cell (i, j) of the weighted edit-distance matrix in place.

    ``last_left``, ``last_right`` and ``transpositions`` are accepted for
    API compatibility but unused: transposition is not supported here.
    """
    c1, c2 = s1[i - 1], s2[j - 1]

    # Cost of skipping c1 (deletion), skipping c2 (insertion), or
    # substituting c1 -> c2 (free when the characters already match).
    deletion = lev[i - 1][j] + _wedit_dist_deletion_cost(c1, c2)
    insertion = lev[i][j - 1] + _wedit_dist_insertion_cost(c1, c2)
    substitution = lev[i - 1][j - 1] + (
        _wedit_dist_substitution_cost(c1, c2) if c1 != c2 else 0
    )

    # Keep the cheapest of the three operations.
    lev[i][j] = min(deletion, insertion, substitution)
75
-
76
- def _wedit_dist_backtrace(lev):
77
- i, j = len(lev) - 1, len(lev[0]) - 1
78
- alignment = [(i, j, lev[i][j])]
79
-
80
- while (i, j) != (0, 0):
81
- directions = [
82
- (i - 1, j), # skip s1
83
- (i, j - 1), # skip s2
84
- (i - 1, j - 1), # substitution
85
- ]
86
-
87
- direction_costs = (
88
- (lev[i][j] if (i >= 0 and j >= 0) else float("inf"), (i, j))
89
- for i, j in directions
90
- )
91
- _, (i, j) = min(direction_costs, key=operator.itemgetter(0))
92
-
93
- alignment.append((i, j, lev[i][j]))
94
- return list(reversed(alignment))
95
-
96
- def _wedit_dist_substitution_cost(c1, c2):
97
- if c1 == ' ' and c2 != ' ':
98
- return 1000000
99
- if c2 == ' ' and c1 != ' ':
100
- return 30
101
- for c in ",.;-!?'":
102
- if c1 == c and c2 != c:
103
- return 20
104
- if c2 == c and c1 != c:
105
- return 20
106
- return 1
107
-
108
- def _wedit_dist_deletion_cost(c1, c2):
109
- if c1 == ' ':
110
- return 2
111
- if c2 == ' ':
112
- return 1000000
113
- return 0.8
114
-
115
- def _wedit_dist_insertion_cost(c1, c2):
116
- if c1 == ' ':
117
- return 1000000
118
- if c2 == ' ':
119
- return 2
120
- return 0.8
121
-
122
def wedit_distance_align(s1, s2):
    """
    Calculate the minimum weighted edit-distance alignment between two
    strings. The alignment maps s1 to s2 minimising the total edit cost,
    each operation being priced by a dedicated weighting function
    (see _wedit_dist_*_cost).

    For example, mapping "rain" to "shine" gives
    [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (4, 5)]
    where (0, 0) is the start state with no letters consumed.
    See: https://web.stanford.edu/class/cs124/lec/med.pdf

    On ties the backtrace prefers: 1. skip s1, 2. skip s2, 3. substitute.
    Transposition is not supported.

    :param s1, s2: The strings to be aligned
    :type s1: str
    :type s2: str
    :rtype: List[Tuple(int, int)]
    """
    # Build the (len(s1)+1) x (len(s2)+1) cost matrix cell by cell.
    lev = _wedit_dist_init(len(s1) + 1, len(s2) + 1)
    for i in range(len(s1)):
        for j in range(len(s2)):
            # last_left/last_right are transposition bookkeeping, unused here.
            _wedit_dist_step(lev, i + 1, j + 1, s1, s2, 0, 0, transpositions=False)
    # Recover the cheapest path through the matrix.
    return _wedit_dist_backtrace(lev)
168
-
169
- def _last_left_t_init(sigma):
170
- return {c: 0 for c in sigma}
171
-
172
def wedit_distance(s1, s2):
    """
    Calculate the Levenshtein weighted edit-distance between two strings.
    The weighted edit distance is the cost of the substitutions, insertions
    and deletions needed to transform s1 into s2, each operation weighted
    by a dedicated weighting function (see _wedit_dist_*_cost).

    For example, transforming "rain" to "shine" requires three steps:
    "rain" -> "sain" -> "shin" -> "shine".

    Transposition edits are not supported by the step function, so the
    transposition bookkeeping of the original implementation (unused by
    _wedit_dist_step) has been dropped.

    :param s1, s2: The strings to be analysed
    :type s1: str
    :type s2: str
    :rtype: int
    """
    len1, len2 = len(s1), len(s2)
    # set up a 2-D array of size (len1+1) x (len2+1)
    lev = _wedit_dist_init(len1 + 1, len2 + 1)

    # iterate over the array, filling cell (i+1, j+1) from its neighbours
    for i in range(len1):
        for j in range(len2):
            # last_left/last_right only matter for transpositions, which are
            # disabled, so 0 placeholders are passed (see _wedit_dist_step).
            _wedit_dist_step(lev, i + 1, j + 1, s1, s2, 0, 0, transpositions=False)

    # BUG FIX: the distance between the full strings lives in the last cell
    # lev[len1][len2]. The previous code returned lev[len1-1][len2-1], i.e.
    # the distance between s1[:-1] and s2[:-1] — so e.g. the distance between
    # two distinct single characters came out as 0.
    return lev[len1][len2]
233
-
234
def space_after(idx, sent):
    """Return True iff the character following position ``idx`` is a space.

    The last position (and beyond) never has a following space.
    """
    return idx < len(sent) - 1 and sent[idx + 1] == ' '
238
-
239
def space_before(idx, sent):
    """Return True iff the character preceding position ``idx`` is a space.

    Position 0 never has a preceding space.
    """
    return idx > 0 and sent[idx - 1] == ' '
243
-
244
- ######## Normalisation pipeline #########
245
- class NormalisationPipeline(Pipeline):
246
-
247
    def __init__(self, beam_size=5, batch_size=32, tokenise_func=None, cache_file=None, no_postproc_lex=False,
                 no_post_clean=False, **kwargs):
        """Normalisation pipeline for modern French.

        Args:
            beam_size: beam width used at generation time (see _forward).
            batch_size: accepted for API compatibility; not stored here.
            tokenise_func: optional tokeniser used for the character
                alignment step; defaults to the module-level basic_tokenise.
            cache_file: optional pickle path caching the lexicon mappings.
            no_postproc_lex: skip loading and applying the lexicon-based
                post-processing.
            no_post_clean: skip the character-level post-cleaning step.
            **kwargs: forwarded to transformers.Pipeline.__init__.
        """
        self.beam_size = beam_size
        # classic tokeniser function (used for alignments)
        if tokenise_func is not None:
            self.classic_tokenise = tokenise_func
        else:
            self.classic_tokenise = basic_tokenise

        self.no_post_clean = no_post_clean
        self.no_postproc_lex = no_postproc_lex
        # load lexicon (skipped entirely when lexicon post-processing is off)
        if no_postproc_lex:
            self.orig_lefff_words, self.mapping_to_lefff, self.mapping_to_lefff2 = None, None, None
        else:
            self.orig_lefff_words, self.mapping_to_lefff, self.mapping_to_lefff2 = self.load_lexicon(cache_file=cache_file)
        super().__init__(**kwargs)
264
-
265
-
266
    def load_lexicon(self, cache_file=None):
        """Build lookup tables from the Lefff lexicon (sagot/lefff_morpho).

        Returns a triple:
          - orig_lefff_words: lower-cased Lefff forms (plus "-"-prefixed
            variants of each);
          - mapping_to_lefff: one-step spelling variant -> canonical form;
          - mapping_to_lefff2: two-step (more aggressive) variant -> form.
        Variants that map ambiguously to several forms are removed at the
        end. Results are pickled to ``cache_file`` when one is given.

        NOTE(review): files are opened without a 'with' block; closing
        relies on garbage collection.
        """
        orig_lefff_words = []
        mapping_to_lefff = {}
        mapping_to_lefff2 = {}
        remove = set([])   # ambiguous one-step variants to discard
        remove2 = set([])  # ambiguous two-step variants to discard

        # load pickled version if there
        if cache_file is not None and os.path.exists(cache_file):
            return pickle.load(open(cache_file, 'rb'))
        dataset = load_dataset("sagot/lefff_morpho")

        for entry in set([x['form'].lower() for x in dataset['test']]):
            orig_lefff_words.append(entry)
            orig_lefff_words.append("-"+entry)
            # one-step variants of the canonical entry
            for mod_entry in set(self._create_modified_versions(entry)):
                # already mapped to a different form -> ambiguous, drop later
                if mod_entry in mapping_to_lefff and mapping_to_lefff[mod_entry] != entry:
                    remove.add(mod_entry)
                    if mod_entry != mod_entry.upper():
                        remove.add(mod_entry)
                if mod_entry not in mapping_to_lefff and mod_entry != entry:
                    mapping_to_lefff[mod_entry] = entry
                    # upper-cased variants go into the second-level mapping
                    if mod_entry != mod_entry.upper():
                        mapping_to_lefff2[mod_entry.upper()] = entry.upper()
                # two-step variants: variants of the one-step variant
                for mod_entry2 in set(self._create_modified_versions(mod_entry)):
                    if mod_entry2 in mapping_to_lefff2 and mapping_to_lefff2[mod_entry2] != entry:
                        remove2.add(mod_entry2)
                        if mod_entry2 != mod_entry2.upper():
                            remove2.add(mod_entry2)
                    if mod_entry2 not in mapping_to_lefff2 and mod_entry2 != entry:
                        mapping_to_lefff2[mod_entry2] = entry
                        if mod_entry2 != mod_entry2.upper():
                            mapping_to_lefff2[mod_entry2.upper()] = entry.upper()
                # two-step variants using the more aggressive transformations
                for mod_entry2 in set(self._create_further_modified_versions(mod_entry)):
                    if mod_entry2 in mapping_to_lefff2 and mapping_to_lefff2[mod_entry2] != entry:
                        remove2.add(mod_entry2)
                        if mod_entry2 != mod_entry2.upper():
                            remove2.add(mod_entry2)
                    if mod_entry2 not in mapping_to_lefff2 and mod_entry2 != entry:
                        mapping_to_lefff2[mod_entry2] = entry
                        if mod_entry2 != mod_entry2.upper():
                            mapping_to_lefff2[mod_entry2.upper()] = entry.upper()
            # aggressive variants derived directly from the canonical entry
            for mod_entry2 in set(self._create_further_modified_versions(entry)):
                if mod_entry2 in mapping_to_lefff2 and mapping_to_lefff2[mod_entry2] != entry:
                    remove2.add(mod_entry2)
                    if mod_entry2 != mod_entry2.upper():
                        remove2.add(mod_entry2)
                if mod_entry2 not in mapping_to_lefff2 and mod_entry2 != entry:
                    mapping_to_lefff2[mod_entry2] = entry
                    if mod_entry2 != mod_entry2.upper():
                        mapping_to_lefff2[mod_entry2.upper()] = entry.upper()

        # add hyphen-prefixed copies of every variant key
        for mod_entry in list(mapping_to_lefff.keys()):
            if mod_entry != "":
                mapping_to_lefff["-"+mod_entry] = "-"+mapping_to_lefff[mod_entry]
        for mod_entry2 in list(mapping_to_lefff2.keys()):
            if mod_entry2 != "":
                mapping_to_lefff2["-"+mod_entry2] = "-"+mapping_to_lefff2[mod_entry2]

        # drop ambiguous variants collected above
        for entry in remove:
            del mapping_to_lefff[entry]
        for entry in remove2:
            del mapping_to_lefff2[entry]

        if cache_file is not None:
            pickle.dump((orig_lefff_words, mapping_to_lefff, mapping_to_lefff2), open(cache_file, 'wb'))
        return orig_lefff_words, mapping_to_lefff, mapping_to_lefff2
333
-
334
    def _create_modified_versions(self, entry=None):
        """Return a tuple of one-step spelling variants of ``entry``.

        Each element is the result of one independent transformation
        (diacritic removal, u/v and i/y alternation, é->es, etc.).
        Returns [] when entry is None.
        """
        if entry is None:
            return []
        return self._remove_diacritics(entry), self._vu_vowel_to_v_vowel(entry), self._vowel_u_to_vowel_v(entry), self._consonant_v_to_consonant_u(entry), self._y_to_i(entry), self._i_to_y(entry), self._eacute_to_e_s(entry), self._final_eacute_to_e_z(entry), self._egrave_to_eacute(entry), self._vowelcircumflex_to_vowel_s(entry), self._ce_to_ee(entry)
338
-
339
    def _create_further_modified_versions(self, entry=None):
        """Return a tuple of more aggressive spelling variants of ``entry``.

        These transformations (s/f confusions, sit->st, z->s, ee<->ce) are
        applied on top of, or instead of, the one-step variants.
        Returns [] when entry is None.
        """
        if entry is None:
            return []
        return self._s_to_f(entry), self._ss_to_ff(entry), self._s_to_ff(entry), self._first_s_to_f(entry), self._first_s_to_ff(entry), self._last_s_to_f(entry), self._last_s_to_ff(entry), self._sit_to_st(entry), self._ee_to_ce(entry), self._z_to_s(entry)
343
-
344
    def _remove_diacritics(self, s=None, allow_alter_length=True):
        """Strip diacritics from ``s``; optionally expand ligatures.

        With allow_alter_length=True, ligatures such as œ/æ are expanded to
        two letters and leading/trailing hyphens stripped, so the string
        length may change.
        """
        # 1-1 replacements only (must not change the number of characters)
        replace_from = "ǽǣáàâäąãăåćčçďéèêëęěğìíîĩĭıïĺľłńñňòóôõöøŕřśšşťţùúûũüǔỳýŷÿźẑżžÁÀÂÄĄÃĂÅĆČÇĎÉÈÊËĘĚĞÌÍÎĨĬİÏĹĽŁŃÑŇÒÓÔÕÖØŔŘŚŠŞŤŢÙÚÛŨÜǓỲÝŶŸŹẐŻŽſ"
        replace_into = "ææaaaaaaaacccdeeeeeegiiiiiiilllnnnoooooorrsssttuuuuuuyyyyzzzzAAAAAAAACCCDEEEEEEGIIIIIIILLLNNNOOOOOORRSSSTTUUUUUUYYYYZZZZs"
        table = s.maketrans(replace_from, replace_into)
        s = s.translate(table)
        # n-m replacements (length-altering)
        if allow_alter_length:
            # NOTE(review): the ('ij', 'ij') pairs look like a lost ĳ/Ĳ
            # ligature in this copy of the file — verify against the original.
            for before, after in [
                ('œ', 'oe'),
                ('æ', 'ae'),
                ('ƣ', 'oi'),
                ('ij', 'ij'),
                ('ȣ', 'ou'),
                ('Œ', 'OE'),
                ('Æ', 'AE'),
                ('Ƣ', 'OI'),
                ('IJ', 'IJ'),
                ('Ȣ', 'OU')
            ]:
                s = s.replace(before, after)
            s = s.strip('-')
        return s
367
-
368
    def _vu_vowel_to_v_vowel(self, s=None):
        """Insert 'u' after 'v' before a vowel (v+vowel -> vu+vowel)."""
        # NOTE(review): despite the name, the substitution goes v -> vu,
        # not vu -> v — confirm which direction was intended.
        s = re.sub('v([aeiou])' , r'vu\1', s)
        return s

    def _vowel_u_to_vowel_v(self, s=None):
        """Replace 'u' after a vowel with 'v'."""
        s = re.sub('([aeiou])u' , r'\1v', s)
        return s

    def _consonant_v_to_consonant_u(self, s=None):
        """Replace 'v' after a non-vowel character with 'u'."""
        s = re.sub('([^aeiou])v' , r'\1u', s)
        return s

    def _y_to_i(self, s=None):
        """Replace every 'y' with 'i'."""
        s = s.replace('y', 'i')
        return s

    def _i_to_y(self, s=None):
        """Replace every 'i' with 'y'."""
        s = s.replace('i', 'y')
        return s

    def _ss_to_ff(self, s=None):
        """Replace every 'ss' with 'ff'."""
        s = s.replace('ss', 'ff')
        return s

    def _s_to_f(self, s=None):
        """Replace every 's' with 'f'."""
        s = s.replace('s', 'f')
        return s

    def _s_to_ff(self, s=None):
        """Replace every 's' with 'ff'."""
        s = s.replace('s', 'ff')
        return s

    def _first_s_to_f(self, s=None):
        """Replace 's' with 'f'."""
        # NOTE(review): re.sub with no count replaces EVERY 's', not just
        # the first as the name suggests (same for _first_s_to_ff).
        s = re.sub('s' , r'f', s)
        return s

    def _last_s_to_f(self, s=None):
        """Replace the last 's' with 'f' (greedy '.*' matches the longest prefix)."""
        s = re.sub('^(.*)s' , r'\1f', s)
        return s

    def _first_s_to_ff(self, s=None):
        """Replace 's' with 'ff' (NOTE(review): replaces every 's', see above)."""
        s = re.sub('s' , r'ff', s)
        return s

    def _last_s_to_ff(self, s=None):
        """Replace the last 's' with 'ff' (greedy '.*' matches the longest prefix)."""
        s = re.sub('^(.*)s' , r'\1ff', s)
        return s

    def _ee_to_ce(self, s=None):
        """Replace every 'ee' with 'ce'."""
        s = s.replace('ee', 'ce')
        return s

    def _sit_to_st(self, s=None):
        """Replace every 'sit' with 'st'."""
        s = s.replace('sit', 'st')
        return s

    def _z_to_s(self, s=None):
        """Replace every 'z' with 's'."""
        s = s.replace('z', 's')
        return s

    def _ce_to_ee(self, s=None):
        """Replace every 'ce' with 'ee' (inverse of _ee_to_ce)."""
        s = s.replace('ce', 'ee')
        return s

    def _eacute_to_e_s(self, s=None, allow_alter_length=True):
        """Replace é/ê followed by any character with 'es' (length-altering)."""
        if allow_alter_length:
            s = re.sub('é(.)' , r'es\1', s)
            s = re.sub('ê(.)' , r'es\1', s)
        return s

    def _final_eacute_to_e_z(self, s=None, allow_alter_length=True):
        """Replace word-final é/ê with 'ez' (length-altering)."""
        if allow_alter_length:
            s = re.sub('é$' , r'ez', s)
            s = re.sub('ê$' , r'ez', s)
        return s

    def _egrave_to_eacute(self, s=None):
        """Replace è followed by any character with é."""
        s = re.sub('è(.)' , r'é\1', s)
        return s

    def _vowelcircumflex_to_vowel_s(self, s=None, allow_alter_length=True):
        """Replace circumflexed vowels with vowel + 's' (length-altering)."""
        if allow_alter_length:
            for before, after in [
                ('â', 'as'),
                ('ê', 'es'),
                ('î', 'is'),
                ('ô', 'os'),
                ('û', 'us'),
            ]:
                s = s.replace(before, after)
        return s
459
-
460
    def _sanitize_parameters(self, clean_up_tokenisation_spaces=None, truncation=None, **generate_kwargs):
        """Split pipeline kwargs into (preprocess, forward, postprocess) dicts.

        ``truncation`` is routed to preprocessing,
        ``clean_up_tokenisation_spaces`` to postprocessing, and everything
        else is forwarded to the model's generate().

        NOTE(review): postprocess() takes a parameter named
        'clean_up_tok_spaces', so passing the key set here would raise a
        TypeError — confirm the intended parameter name.
        """
        preprocess_params = {}
        if truncation is not None:
            preprocess_params["truncation"] = truncation
        forward_params = generate_kwargs
        postprocess_params = {}
        if clean_up_tokenisation_spaces is not None:
            postprocess_params["clean_up_tokenisation_spaces"] = clean_up_tokenisation_spaces

        return preprocess_params, forward_params, postprocess_params
470
-
471
-
472
    def check_inputs(self, input_length: int, min_length: int, max_length: int):
        """
        Hook for validating the input length against the model's limits.

        Currently performs no check and always returns True.
        """
        return True
477
-
478
    def make_printable(self, s):
        '''Replace non-printable characters in a string.'''
        # NOTE(review): NOPRINT_TRANS_TABLE is not defined anywhere in this
        # module (as far as visible), so calling this method raises a
        # NameError — confirm where the table was meant to come from.
        return s.translate(NOPRINT_TRANS_TABLE)
481
-
482
-
483
    def normalise(self, line):
        """Pre-normalise a raw input line for the model.

        Standardises guillemets/curly quotes to straight ones, collapses
        repeated spaces and quotes, trims leading/trailing spaces, and
        appends the ' </s>' end-of-sentence marker.
        """
        for before, after in [('[«»\“\”]', '"'), ('[‘’]', "'"), (' +', ' '), ('\"+', '"'),
                              ("'+", "'"), ('^ *', ''), (' *$', '')]:
            line = re.sub(before, after, line)
        return line.strip() + ' </s>'
488
-
489
- def _parse_and_tokenise(self, *args, truncation):
490
- prefix = ""
491
- if isinstance(args[0], list):
492
- if self.tokenizer.pad_token_id is None:
493
- raise ValueError("Please make sure that the tokeniser has a pad_token_id when using a batch input")
494
- args = ([prefix + arg for arg in args[0]],)
495
- padding = True
496
-
497
- elif isinstance(args[0], str):
498
- args = (prefix + args[0],)
499
- padding = False
500
- else:
501
- raise ValueError(
502
- f" `args[0]`: {args[0]} have the wrong format. The should be either of type `str` or type `list`"
503
- )
504
- inputs = [self.normalise(x) for x in args]
505
- inputs = self.tokenizer(inputs, padding=padding, truncation=truncation, return_tensors=self.framework)
506
- toks = []
507
- for tok_ids in inputs.input_ids:
508
- toks.append(" ".join(self.tokenizer.convert_ids_to_tokens(tok_ids)))
509
- # This is produced by tokenisers but is an invalid generate kwargs
510
- if "token_type_ids" in inputs:
511
- del inputs["token_type_ids"]
512
- return inputs
513
-
514
    def preprocess(self, inputs, truncation=TruncationStrategy.DO_NOT_TRUNCATE, **kwargs):
        """Pipeline preprocessing step: normalise and tokenise the inputs."""
        inputs = self._parse_and_tokenise(inputs, truncation=truncation, **kwargs)
        return inputs
517
-
518
    def _forward(self, model_inputs, **generate_kwargs):
        """Pipeline forward step: run beam-search generation.

        Fills min/max length defaults from the model config, forces the
        pipeline's beam size, and reshapes the generated ids to
        (batch, candidates_per_input, seq_len).
        """
        in_b, input_length = model_inputs["input_ids"].shape
        generate_kwargs["min_length"] = generate_kwargs.get("min_length", self.model.config.min_length)
        generate_kwargs["max_length"] = generate_kwargs.get("max_length", self.model.config.max_length)
        # the pipeline's beam size always overrides any caller-supplied value
        generate_kwargs['num_beams'] = self.beam_size
        self.check_inputs(input_length, generate_kwargs["min_length"], generate_kwargs["max_length"])
        output_ids = self.model.generate(**model_inputs, **generate_kwargs)
        out_b = output_ids.shape[0]
        # group the generated candidates per input example
        output_ids = output_ids.reshape(in_b, out_b // in_b, *output_ids.shape[1:])
        return {"output_ids": output_ids}
528
-
529
    def postprocess(self, model_outputs, clean_up_tok_spaces=False):
        """Pipeline postprocessing step: decode generated ids to text.

        Returns a list of {"text": ...} records, one per candidate of the
        first batch element.

        NOTE(review): HF decode()'s parameter is spelled
        'clean_up_tokenization_spaces'; the kwarg below is likely ignored —
        confirm against the installed transformers version.
        """
        records = []
        for output_ids in model_outputs["output_ids"][0]:
            record = {"text": self.tokenizer.decode(output_ids, skip_special_tokens=True,
                                                    clean_up_tokenisation_spaces=clean_up_tok_spaces).strip()}
            records.append(record)
        return records
536
-
537
    def postprocess_correct_sent(self, alignment):
        """Apply lexicon-based correction to each aligned (orig, pred) pair.

        Rewrites the predicted side of each alignment triple in place; the
        weight slot is set to -1 once a word has been post-processed.
        Returns the (mutated) alignment list.
        """
        output = []  # NOTE(review): unused; the function returns `alignment`
        for i, (orig_word, pred_word, _) in enumerate(alignment):
            if orig_word != '':
                postproc_word = self.postprocess_correct_word(orig_word, pred_word, alignment)
                alignment[i] = (orig_word, postproc_word, -1)  # replace prediction in the alignment
        return alignment
544
-
545
    def postprocess_correct_word(self, orig_word, pred_word, alignment):
        """Choose the best surface form for one aligned (orig, pred) word pair.

        Decision cascade:
          1. numbers / Roman numerals: keep the original (U normalised to V);
          2. the prediction is a known lexicon form: keep it, re-applying the
             original word's capitalisation;
          3. the original is a known lexicon form: keep the original;
          4. prediction, then original, reachable via the one-step variant
             mapping: use the mapped form;
          5. same with the two-step variant mapping;
          6. identical words, or a space aligned to nothing: keep original;
          7. otherwise fall back on the weighted edit distance: keep the
             original if the prediction drifted too far (wed > 2), else keep
             the prediction.
        """
        orig_caps = self.get_caps(orig_word)
        # 1. numbers and Roman numerals: keep the original form (U -> V)
        if re.match("^[0-9]+$", orig_word) or re.match("^[XVUI]+$", orig_word):
            orig_word = orig_word.replace('U', 'V')
            return orig_word
        # 2. the prediction exists in the lexicon: take it
        if pred_word.lower() in self.orig_lefff_words:
            return self.set_caps(pred_word, *orig_caps)
        # 3. otherwise, if original word exists, take that
        if orig_word.lower() in self.orig_lefff_words:
            return orig_word

        pred_replacement = None
        # 4a. prediction is in the lexicon with some changes: take that
        if pred_word != '' and pred_word != ' ':
            pred_replacement = self.mapping_to_lefff.get(pred_word, None)
        if pred_replacement is not None:
            return self.add_orig_punct(pred_word, self.set_caps(pred_replacement, *orig_caps))
        # 4b. original is in the lexicon with some changes: take that
        orig_replacement = self.mapping_to_lefff.get(orig_word, None)
        if orig_replacement is not None:
            return self.add_orig_punct(pred_word, self.set_caps(orig_replacement, *orig_caps))

        # 5a. prediction is in the lexicon with more changes: take that
        if pred_word != '' and pred_word != ' ':
            pred_replacement = self.mapping_to_lefff2.get(pred_word, None)
            if pred_replacement is not None:
                return self.add_orig_punct(pred_word, self.set_caps(pred_replacement, *orig_caps))
        # 5b. original is in the lexicon with more changes: take that
        orig_replacement = self.mapping_to_lefff2.get(orig_word, None)
        if orig_replacement is not None:
            return self.add_orig_punct(pred_word, self.set_caps(orig_replacement, *orig_caps))

        # 6. trivial agreements
        if orig_word == pred_word:
            return orig_word
        if orig_word == " " and pred_word == "":
            return orig_word

        # 7. fall back on the weighted edit distance between the two forms
        wed = wedit_distance(pred_word,orig_word)
        if wed > 2:
            print("DEBUG:O",orig_word,"(P:",pred_word,":",wed,")")
            return orig_word
        print("DEBUG:P",self.add_orig_punct(pred_word, self.set_caps(pred_word, *orig_caps)),"(P:",pred_word,"vs. O:",orig_word,":",wed,")")
        return self.add_orig_punct(pred_word, self.set_caps(pred_word, *orig_caps))
603
-
604
- def get_surrounding_punct(self, word):
605
- beginning_match = re.match("^(['\-]*)", word)
606
- beginning, end = '', ''
607
- if beginning_match:
608
- beginning = beginning_match.group(1)
609
- end_match = re.match("(['\-]*)$", word)
610
- if end_match:
611
- end = end_match.group(1)
612
- return beginning, end
613
-
614
    def add_orig_punct(self, old_word, new_word):
        """Re-attach ``old_word``'s leading/trailing punctuation to ``new_word``.

        Punctuation is only added when ``new_word`` does not already start
        (resp. end) with it.
        """
        beginning, end = self.get_surrounding_punct(old_word)
        output = ''
        if beginning != None and not re.match("^"+re.escape(beginning), new_word):
            output += beginning
        if new_word != None:
            output += new_word
        # NOTE(review): re.match anchors at the start of new_word, so the
        # trailing check below only matches when new_word IS the trailing
        # punctuation; re.search was probably intended — confirm.
        if end != None and not re.match(re.escape(end)+"$", new_word):
            output += end
        return output
624
-
625
    def get_caps(self, word):
        """Return (first_upper, second_upper, allcaps) flags for ``word``.

        The flags describe the capitalisation of the first two characters
        and whether the whole word is upper case; they are consumed by
        set_caps() to transfer capitalisation onto another form.
        """
        # remove punctuation at the beginning or end before inspecting case
        word = word.strip("-' ")
        first, second, allcaps = False, False, False
        if len(word) > 0 and word[0].lower() != word[0]:
            first = True
        if len(word) > 1 and word[1].lower() != word[1]:
            second = True
        # all caps only counts when the word actually contains cased letters
        if word.upper() == word and word.lower() != word:
            allcaps = True
        return first, second, allcaps
636
-
637
    def set_caps(self, word, first, second, allcaps):
        """Re-apply capitalisation flags (as produced by get_caps) to ``word``.

        Returns None when word is None.
        """
        if word == None:
            return None
        if allcaps:
            return word.upper()
        elif first and second:
            return word[0].upper() + word[1].upper() + word[2:]
        elif first:
            # NOTE(review): both branches below are identical; the length
            # test is redundant (slicing never raises here).
            if len(word) > 1:
                return word[0].upper() + word[1:]
            else:
                return word[0].upper() + word[1:]
        elif second:
            # NOTE(review): the first two branches below are also identical.
            if len(word) > 2:
                return word[0] + word[1].upper() + word[2:]
            elif len(word) > 1:
                return word[0] + word[1].upper() + word[2:]
            else:
                return word[0]
        else:
            return word
658
-
659
    def __call__(self, input_sents, **kwargs):
        r"""
        Normalise the input sentences.
        Args:
            input_sents (`List[str]`):
                Input texts for the encoder.
            kwargs:
                Additional keyword arguments passed along to the parent
                pipeline (and ultimately to the model's generate method).
        Return:
            A list of `dict`, one per input sentence, with the keys:
            - **text** (`str`) -- the normalised sentence.
            - **alignment** -- list of ([ref_start, ref_end],
              [pred_start, pred_end]) character spans linking the input to
              the normalised output (see get_char_idx_align).
        """
        result = super().__call__(input_sents, **kwargs)

        output = []
        for i in range(len(result)):
            # take the top candidate for each input sentence
            input_sent, pred_sent = input_sents[i].strip(), result[i][0]['text'].strip()
            # normalise long s in the input before aligning
            input_sent = input_sent.replace('ſ' , 's')
            if not self.no_post_clean:
                pred_sent = self.post_cleaning(pred_sent)
            # character-align input and prediction
            alignment, pred_sent_tok = self.align(input_sent, pred_sent)
            if not self.no_postproc_lex:
                # lexicon-based word-by-word correction
                alignment = self.postprocess_correct_sent(alignment)
            pred_sent = self.get_pred_from_alignment(alignment)
            if not self.no_post_clean:
                pred_sent = self.post_cleaning(pred_sent)
            char_spans = self.get_char_idx_align(input_sent, pred_sent, alignment)
            output.append({'text': pred_sent, 'alignment': char_spans})
        return output
697
-
698
    def post_cleaning(self, s):
        """Character-level clean-up of a predicted sentence.

        Removes narrow no-break spaces, expands historical characters
        (long s, ß, &) and nasal-tilde vowels (ẽ/ã/õ/ũ -> en/an/on/un,
        with 'm' instead of 'n' before m/b/p).
        """
        s = s.replace(' ' , '')
        s = s.replace('ſ' , 's')
        s = s.replace('ß' , 'ss')
        s = s.replace('&' , 'et')
        s = re.sub('ẽ([mbp])' , r'em\1', s)
        s = s.replace('ẽ' , 'en')
        s = re.sub('ã([mbp])' , r'am\1', s)
        s = s.replace('ã' , 'an')
        s = re.sub('õ([mbp])' , r'om\1', s)
        s = s.replace('õ' , 'on')
        s = re.sub('ũ([mbp])' , r'um\1', s)
        s = s.replace('ũ' , 'un')
        return s
712
-
713
    def align(self, sent_ref, sent_pred):
        """Character-align the original sentence with the prediction.

        Both sentences are tokenised with the classic tokeniser, homogenised
        (lowercased, de-accented) and aligned with the weighted edit
        distance; the backtrace is then re-segmented into word-level triples
        (ref_word, pred_word, weight_delta).

        Returns (alignment, tokenised_prediction). The asserts at the end
        check that the alignment covers both strings entirely.
        """
        sent_ref_tok = self.classic_tokenise(re.sub('[  ]', ' ', sent_ref))
        sent_pred_tok = self.classic_tokenise(re.sub('[  ]', ' ', sent_pred))
        # align on homogenised strings so diacritics/case do not matter
        backpointers = wedit_distance_align(homogenise(sent_ref_tok), homogenise(sent_pred_tok))
        alignment, current_word, seen1, seen2, last_weight = [], ['', ''], [], [], 0
        for i_ref, i_pred, weight in backpointers:
            if i_ref == 0 and i_pred == 0:
                continue
            # next characters are both spaces -> add current word straight away
            if i_ref <= len(sent_ref_tok) and sent_ref_tok[i_ref-1] == ' ' \
               and i_pred <= len(sent_pred_tok) and sent_pred_tok[i_pred-1] == ' ' \
               and i_ref not in seen1 and i_pred not in seen2:
                # if current word is empty -> insert a space on both sides
                if current_word[0] == '' and current_word[1] == '':
                    alignment.append((' ', ' ', weight-last_weight))
                # else add the current word to both sides
                else:
                    alignment.append((current_word[0], current_word[1], weight-last_weight))
                last_weight = weight
                current_word = ['', '']
                seen1.append(i_ref)
                seen2.append(i_pred)
            # if space in ref and dash in pred
            elif i_ref <= len(sent_ref_tok) and sent_ref_tok[i_ref-1] == ' ' \
                 and i_pred <= len(sent_pred_tok) and sent_pred_tok[i_pred-1] == '-' \
                 and i_ref not in seen1 and i_pred not in seen2 \
                 and current_word[0] == '' and current_word[1] == '':
                alignment.append((' ', '', weight-last_weight))
                last_weight = weight
                current_word = ['', '-']
                seen1.append(i_ref)
                seen2.append(i_pred)
            else:
                end_space = ''  # originally '░' (visual filler, now disabled)
                # add new character to ref
                if i_ref <= len(sent_ref_tok) and i_ref not in seen1:
                    if i_ref > 0:
                        current_word[0] += sent_ref_tok[i_ref-1]
                    seen1.append(i_ref)
                # add new character to pred
                if i_pred <= len(sent_pred_tok) and i_pred not in seen2:
                    if i_pred > 0:
                        current_word[1] += sent_pred_tok[i_pred-1] if sent_pred_tok[i_pred-1] != ' ' else ' '
                        end_space = '' if space_after(i_pred, sent_pred_tok) else ''
                    seen2.append(i_pred)
                # word boundary reached in ref with content accumulated
                if i_ref <= len(sent_ref_tok) and sent_ref_tok[i_ref-1] == ' ' and current_word[0].strip() != '':
                    alignment.append((current_word[0].strip(), current_word[1].strip() + end_space, weight-last_weight))
                    last_weight = weight
                    current_word = ['', '']
                # space in ref but aligned to nothing in pred (under-translation)
                elif i_ref <= len(sent_ref_tok) and sent_ref_tok[i_ref-1] == ' ' and current_word[1].strip() == '':
                    alignment.append((current_word[0], current_word[1], weight-last_weight))
                    last_weight = weight
                    current_word = ['', '']
                    seen1.append(i_ref)
                    seen2.append(i_pred)
        # final word
        alignment.append((current_word[0].strip(), current_word[1].strip(), weight-last_weight))
        # check that both strings are entirely covered
        recovered1 = re.sub(' +', ' ', ' '.join([x[0] for x in alignment]))
        recovered2 = re.sub(' +', ' ', ' '.join([x[1] for x in alignment]))

        assert re.sub('[  ]+', ' ', recovered1) == re.sub('[  ]+', ' ', sent_ref_tok), \
            '\n1: *' + re.sub('[  ]+', ' ', recovered1) + "*\n1: *" + re.sub('[  ]+', ' ', sent_ref_tok) + '*'
        assert re.sub('[░▁ ]+', '', recovered2) == re.sub('[▁ ]+', '', sent_pred_tok), \
            '\n2: ' + re.sub('[  ]+', ' ', recovered2) + "\n2: " + re.sub('[  ]+', ' ', sent_pred_tok)
        return alignment, sent_pred_tok
785
-
786
- def get_pred_from_alignment(self, alignment):
787
- return re.sub(' +', ' ', ''.join([x[1] if x[1] != '' else '\n' for x in alignment]).replace('\n', ''))
788
-
789
- def get_char_idx_align(self, sent_ref, sent_pred, alignment):
790
- covered_ref, covered_pred = 0, 0
791
- ref_chars = [i for i, character in enumerate(sent_ref)] + [len(sent_ref)] #
792
- pred_chars = [i for i, character in enumerate(sent_pred)] + [len(sent_pred)]# if character not in [' ']]
793
- align_idx = []
794
-
795
- for a_ref, a_pred, _ in alignment:
796
- if a_ref == '' and a_pred == '':
797
- covered_pred += 1
798
- continue
799
- a_pred = re.sub(' +', ' ', a_pred).strip()
800
- span_ref = [ref_chars[covered_ref], ref_chars[covered_ref + len(a_ref)]]
801
- covered_ref += len(a_ref)
802
- span_pred = [pred_chars[covered_pred], pred_chars[covered_pred + len(a_pred)]]
803
- covered_pred += len(a_pred)
804
- align_idx.append((span_ref, span_pred))
805
-
806
- return align_idx
807
-
808
- def normalise_text(list_sents, batch_size=32, beam_size=5, cache_file=None, no_postproc_lex=False, no_post_clean=False):
809
- tokeniser = AutoTokenizer.from_pretrained("rbawden/modern_french_normalisation")
810
- model = AutoModelForSeq2SeqLM.from_pretrained("rbawden/modern_french_normalisation")
811
- normalisation_pipeline = NormalisationPipeline(model=model,
812
- tokenizer=tokeniser,
813
- batch_size=batch_size,
814
- beam_size=beam_size,
815
- cache_file=cache_file,
816
- no_postproc_lex=no_postproc_lex,
817
- no_post_clean=no_post_clean)
818
- normalised_outputs = normalisation_pipeline(list_sents)
819
- return normalised_outputs
820
-
821
- def normalise_from_stdin(batch_size=32, beam_size=5, cache_file=None, no_postproc_lex=False, no_post_clean=False):
822
- tokeniser = AutoTokenizer.from_pretrained("rbawden/modern_french_normalisation")
823
- model = AutoModelForSeq2SeqLM.from_pretrained("rbawden/modern_french_normalisation")
824
- normalisation_pipeline = NormalisationPipeline(model=model,
825
- tokenizer=tokeniser,
826
- batch_size=batch_size,
827
- beam_size=beam_size,
828
- cache_file=cache_file,
829
- no_postproc_lex=no_postproc_lex,
830
- no_post_clean=no_post_clean
831
- )
832
- list_sents = []
833
- ex = ["7. Qu'vne force plus grande de ſi peu que l'on voudra, que celle auec laquelle l'eau de la hauteur de trente & vn pieds, tend à couler en bas, ſuffit pour faire admettre ce vuide apparent, & meſme ſi grãd que l'on voudra, c'eſt à dire, pour faire des-vnir les corps d'vn ſi grand interualle que l'on voudra, pourueu qu'il n'y ait point d'autre obſtacle à leur ſeparation ny à leur eſloignement, que l'horreur que la Nature a pour ce vuide apparent."]
834
- for sent in sys.stdin:
835
- list_sents.append(sent.strip())
836
- normalised_outputs = normalisation_pipeline(list_sents)
837
- for s, sent in enumerate(normalised_outputs):
838
- alignment=sent['alignment']
839
-
840
- print(sent['text'])
841
- # checking that the alignment makes sense
842
- #for b, a in alignment:
843
- # print('input: ' + ''.join([list_sents[s][x] for x in range(b[0], max(len(b), b[1]))]) + '')
844
- # print('pred: ' + ''.join([sent['text'][x] for x in range(a[0], max(len(a), a[1]))]) + '')
845
-
846
- return normalised_outputs
847
-
848
-
849
- if __name__ == '__main__':
850
- import argparse
851
- parser = argparse.ArgumentParser()
852
- parser.add_argument('-k', '--batch_size', type=int, default=32, help='Set the batch size for decoding')
853
- parser.add_argument('-b', '--beam_size', type=int, default=5, help='Set the beam size for decoding')
854
- parser.add_argument('-i', '--input_file', type=str, default=None, help='Input file. If None, read from STDIN')
855
- parser.add_argument('-c', '--cache_lexicon', type=str, default=None, help='Path to cache the lexicon file to speed up loading')
856
- parser.add_argument('-n', '--no_postproc_lex', default=False, action='store_true', help='Deactivate postprocessing to speed up normalisation, but this may degrade the output')
857
- parser.add_argument('-m', '--no_post_clean', default=False, action='store_true', help='Deactivate postprocessing to speed up normalisation, but this may degrade the output')
858
-
859
- args = parser.parse_args()
860
-
861
- if args.input_file is None:
862
- normalise_from_stdin(batch_size=args.batch_size,
863
- beam_size=args.beam_size,
864
- cache_file=args.cache_lexicon,
865
- no_postproc_lex=args.no_postproc_lex,
866
- no_post_clean=args.no_post_clean)
867
- else:
868
- list_sents = []
869
- with open(args.input_file) as fp:
870
- for line in fp:
871
- list_sents.append(line.strip())
872
- output_sents = normalise_text(list_sents,
873
- batch_size=args.batch_size,
874
- beam_size=args.beam_size,
875
- cache_file=args.cache_lexicon,
876
- no_postproc_lex=args.no_postproc_lex,
877
- no_post_clean=args.no_post_clean)
878
- for output_sent in output_sents:
879
- print(output_sent['text'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:777e3ebf4be88372d6fa982cdff430b06d61461574236c7a213a37d70bd47085
3
- size 25265973
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16e283cd4628cac0f2b36f2e8181ae0ba0f65b0e03866fde6067a0cd8e3c78d8
3
+ size 25264557