Update generic_ner.py
Browse files- generic_ner.py +13 -1
generic_ner.py
CHANGED
|
@@ -148,6 +148,18 @@ def realign(
|
|
| 148 |
return words_list, preds_list, confidence_list
|
| 149 |
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
class MultitaskTokenClassificationPipeline(Pipeline):
|
| 152 |
|
| 153 |
def _sanitize_parameters(self, **kwargs):
|
|
@@ -166,7 +178,7 @@ class MultitaskTokenClassificationPipeline(Pipeline):
|
|
| 166 |
text, padding="max_length", truncation=True, max_length=512
|
| 167 |
)
|
| 168 |
|
| 169 |
-
text_sentence = tokenize(text)
|
| 170 |
return tokenized_inputs, text_sentence, text
|
| 171 |
|
| 172 |
def _forward(self, inputs):
|
|
|
|
| 148 |
return words_list, preds_list, confidence_list
|
| 149 |
|
| 150 |
|
| 151 |
+
import re, string
|
| 152 |
+
|
| 153 |
+
# Extra punctuation marks (typographic quotes, guillemets, dashes, bullets,
# ellipsis, section/paragraph signs, daggers, per-mille, primes, angle
# brackets) that are not part of ``string.punctuation``.
additional_punctuation = "‘’“”„«»•–—―‣◦…§¶†‡‰′″〈〉"

# Character class matching any single punctuation character. Compiled once at
# import time so that the escaping and pattern construction are not repeated
# on every call. ``re.escape`` is required because ``string.punctuation``
# contains characters that are special inside a character class
# (``]``, ``\``, ``^``, ``-``).
_PUNCTUATION_PATTERN = re.compile(
    "([" + re.escape(string.punctuation + additional_punctuation) + "])"
)


def add_spaces_around_punctuation(text):
    """Return *text* with a space inserted before and after each punctuation mark.

    Punctuation means every character in ``string.punctuation`` plus the
    characters listed in ``additional_punctuation``. Each match is padded
    independently and existing whitespace is left untouched, so adjacent
    punctuation or punctuation next to a space yields consecutive spaces.

    Args:
        text: The string to process.

    Returns:
        A new string in which every punctuation character ``p`` has been
        replaced by ``" p "``.

    Example:
        >>> add_spaces_around_punctuation("Hello, world!")
        'Hello ,  world ! '
    """
    return _PUNCTUATION_PATTERN.sub(r" \1 ", text)
|
| 161 |
+
|
| 162 |
+
|
| 163 |
class MultitaskTokenClassificationPipeline(Pipeline):
|
| 164 |
|
| 165 |
def _sanitize_parameters(self, **kwargs):
|
|
|
|
| 178 |
text, padding="max_length", truncation=True, max_length=512
|
| 179 |
)
|
| 180 |
|
| 181 |
+
text_sentence = tokenize(add_spaces_around_punctuation(text))
|
| 182 |
return tokenized_inputs, text_sentence, text
|
| 183 |
|
| 184 |
def _forward(self, inputs):
|