| | import spacy |
| | from spacy.language import Language |
| | import regex |
| |
|
| |
|
@Language.component("entity_punctuation_removal")
def entity_punctuation_removal(doc):
    r"""Drop lone-punctuation entities from ``doc.ents``.

    An entity is removed when both of the following hold:

    * its text matches ``^\p{P}$``, i.e. it is a single character from the
      Unicode punctuation category, and
    * the ``ent_iob_`` tag of its root token is ``"B"``.

    Every other entity is kept, in its original order.

    Args:
        doc: The document handed to this pipeline component.

    Returns:
        The same ``doc`` object, with ``doc.ents`` reassigned to the
        filtered tuple of entities.
    """
    # Resolve the pattern once per call rather than once per entity.
    is_single_punct = regex.compile(r'^\p{P}$').match

    # Whether an entity is dropped does not depend on its position in the
    # list, so a single filter pass is enough (no index bookkeeping and no
    # repeated list.pop(i)). The text check runs first and short-circuits,
    # so ``ent.root`` is only read for punctuation-only entities.
    doc.ents = tuple(
        ent
        for ent in doc.ents
        if not (is_single_punct(ent.text) and ent.root.ent_iob_ == "B")
    )
    return doc
| |
|
# Explicit registration of the component function under the same name that
# the decorator on its definition already uses.
Language.component(
    "entity_punctuation_removal",
    func=entity_punctuation_removal,
)