emanuelaboros committed on
Commit
0fa92dc
·
verified ·
1 Parent(s): edc7893

Update generic_ner.py

Browse files
Files changed (1) hide show
  1. generic_ner.py +13 -1
generic_ner.py CHANGED
@@ -148,6 +148,18 @@ def realign(
148
  return words_list, preds_list, confidence_list
149
 
150
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  class MultitaskTokenClassificationPipeline(Pipeline):
152
 
153
  def _sanitize_parameters(self, **kwargs):
@@ -166,7 +178,7 @@ class MultitaskTokenClassificationPipeline(Pipeline):
166
  text, padding="max_length", truncation=True, max_length=512
167
  )
168
 
169
- text_sentence = tokenize(text)
170
  return tokenized_inputs, text_sentence, text
171
 
172
  def _forward(self, inputs):
 
148
  return words_list, preds_list, confidence_list
149
 
150
 
151
import re
import string

# Typographic punctuation that ``string.punctuation`` (ASCII only) does not
# cover: curly/angle quotes, bullets, dashes, ellipsis, section marks, etc.
additional_punctuation = "‘’“”„«»•–—―‣◦…§¶†‡‰′″〈〉"

# Built once at import time: one character class matching any ASCII or
# additional punctuation mark. ``re.escape`` neutralises the characters that
# are special inside a class (``]``, ``\``, ``^``, ``-``).
_PUNCTUATION_PATTERN = re.compile(
    "([{}])".format(re.escape(string.punctuation + additional_punctuation))
)


def add_spaces_around_punctuation(text):
    """Return *text* with a space inserted before and after each punctuation mark.

    Every character found in ``string.punctuation`` or in
    ``additional_punctuation`` is replaced by itself surrounded by single
    spaces. Existing whitespace is left untouched, so the result may contain
    consecutive spaces (e.g. ``"a, b"`` becomes ``"a ,  b"``).

    Args:
        text: The string to process.

    Returns:
        A new string in which each punctuation character is space-delimited.
    """
    return _PUNCTUATION_PATTERN.sub(r" \1 ", text)
161
+
162
+
163
  class MultitaskTokenClassificationPipeline(Pipeline):
164
 
165
  def _sanitize_parameters(self, **kwargs):
 
178
  text, padding="max_length", truncation=True, max_length=512
179
  )
180
 
181
+ text_sentence = tokenize(add_spaces_around_punctuation(text))
182
  return tokenized_inputs, text_sentence, text
183
 
184
  def _forward(self, inputs):