from spacy.tokens import Doc, Span, Token
from spacy.matcher import PhraseMatcher
from spacy.util import filter_spans
from spacy.language import Language
import re
| |
|
| | |
@Language.factory("count_extraction_component")
class CountExtractorComponent:
    """spaCy pipeline component that parses a quantity out of entity text.

    For every entity whose label matches ``label`` (default ``"CONNECTION"``),
    the entity text is matched against three count notations:

    * ``"3x cable"``  — count before the name,
    * ``"cable x3"``  — count after the name, ``x`` before the digits,
    * ``"cable 3x"``  — count after the name, ``x`` after the digits,

    and the results are stored on the custom Span extensions ``_.text``
    (the name with the count notation stripped) and ``_.count``. Entities
    with other labels only get ``_.text`` set to their stripped text;
    matching entities with no recognizable notation get ``_.count = 1``.
    """

    def __init__(self, nlp, name, label="CONNECTION"):
        self.label = label
        # "3x cable" / "3 X cable"
        self.reg_left = re.compile(r"^(?P<count>\d+)\s*[xX]\s*(?P<name>.+)$")
        # "cable x3" / "cable X 3"
        self.reg_right = re.compile(r"^(?P<name>.+)\s*[xX]\s*(?P<count>\d+)$")
        # "cable 3x" / "cable 23 X". The name group is NON-greedy: with a
        # greedy ".+" the name swallowed the leading digits of multi-digit
        # counts ("cable 23x" parsed as name="cable 2", count=3).
        self.reg_right_inverted = re.compile(
            r"^(?P<name>.+?)\s*(?P<count>\d+)\s*[xX]$"
        )

        # force=True so re-creating the pipeline (e.g. in tests) does not
        # raise on an already-registered extension.
        Span.set_extension("count", default=None, force=True)
        Span.set_extension("text", default=None, force=True)

    def __call__(self, doc):
        """Annotate ``doc.ents`` in place and return the doc."""
        for ent in doc.ents:
            text = ent.text.strip()
            if ent.label_ != self.label:
                # Not our label: record the cleaned text, leave count at
                # its extension default (None).
                ent._.text = text
                continue

            # Try the notations in order; first match wins.
            for pattern in (self.reg_left, self.reg_right, self.reg_right_inverted):
                match = pattern.match(text)
                if match is not None:
                    groups = match.groupdict()
                    ent._.text = groups["name"].strip()
                    ent._.count = int(groups["count"])
                    break
            else:
                # No count notation found: keep the raw text, assume one.
                ent._.text = text
                ent._.count = 1
        return doc
| |
|