| | import json |
| |
|
| | from spacy.language import Language |
| | from spacy.matcher import PhraseMatcher |
| |
|
| | |
| | default_normalization_table = { |
| | "Dolby Atmos": ["Dolby Atmos", "Dolby Audio Atmos", "Dolby Atmos Audio"], |
| | "Ethernet": [ |
| | "Ethernet LAN", |
| | "Ethernet port RJ-45", |
| | "Ethernet RJ-45", |
| | "Ethernet RJ45", |
| | "Ethernet-LAN RJ-45", |
| | "LAN RJ45", |
| | "Ethernet R45", |
| | ], |
| | "CI+ Slot": [ |
| | "CI+ Card Slot", |
| | "Common Interface Plus (CI+)", |
| | "Common Interface Plus", |
| | "Card Slot CI +", |
| | ], |
| | "Scart": ["SCART", "Scart Input"], |
| | "Component In": [ |
| | "Component In", |
| | "Component in(YPbPr)", |
| | "Component Input", |
| | "Component (Y/Pb/Pr)", |
| | "Component In (Y/Pb/Pr)", |
| | ], |
| | "USB 2.0": ["USB2.0"], |
| | "Digital Audio": [ |
| | "Digital Audio Out", |
| | "Digital Audio Output", |
| | "Digital Audio Output(Coaxial and Optic)", |
| | ], |
| | "Composite In": ["Composite", "AV Composite In"], |
| | "3.5mm Headphone jack": ["3.5mm Headphone jack", "Headphone 3.5mm jack"], |
| | "Optical Audio Out": ["Optical Audio Out", "Optical Out"], |
| | "Android": ["ANDROID"], |
| | "Android 7.1": ["Android Nougat"], |
| | "Google TV": ["GoogleTV", "Google LED TV", "Google miniLED TV", "Google OLED TV"], |
| | "VIDAA U4": ["VIDAA U4.0"], |
| | "Android TV": ["Android TV", "AndroidTV", "Android"], |
| | "Titan OS": ["TITAN OS"], |
| | "7680x4320": ["8K"], |
| | "3840x2160": ["4K", "4K UltraHD", "4K Ultra HD", "UltraHD", "Ultra HD"], |
| | "1920x1080": ["FullHD", "Full HD"], |
| | "1366x768": ["HD Ready", "HDReady"], |
| | "1280x720": ["HD"], |
| | "640x480": ["SD"], |
| | "Wifi": ["Wifi", "Wi-Fi", "Wifi built in", "built in Wifi", "WiFi integrated"], |
| | "Blutooth": ["BLUETOOTH"], |
| | } |
| |
|
| |
|
@Language.factory("normalizer_component")
class NormalizerComponent:
    """Pipeline component that rewrites entity text to a canonical name.

    A normalization table maps each canonical name to a list of variant
    phrases. Every variant is registered with a ``PhraseMatcher`` that
    compares on the ``LOWER`` token attribute, so matching ignores case.

    Args:
        nlp: The pipeline object; its vocab and tokenizer (``make_doc``) are
            used to build the patterns and to tokenize entity text.
        name: The component name passed in by the factory (not used here).
        norms: ``None`` to use ``default_normalization_table``, a ``str``
            path to a JSON file containing the table, or any other object,
            which is used as the table directly (it must provide
            ``.items()`` yielding ``(canonical, variants)`` pairs).
    """

    def __init__(self, nlp, name, norms=None):
        if norms is None:
            self.norm_table = default_normalization_table
        elif isinstance(norms, str):
            # Use a context manager so the file handle is closed promptly
            # instead of being left for the garbage collector.
            with open(norms, encoding="utf-8") as table_file:
                self.norm_table = json.load(table_file)
        else:
            self.norm_table = norms

        self.nlp = nlp
        self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

        # The loop variable is deliberately not called ``name`` so that it
        # does not shadow the ``name`` parameter.
        for canonical, variants in self.norm_table.items():
            self.matcher.add(
                canonical, [nlp.make_doc(variant) for variant in variants]
            )

    def __call__(self, doc):
        """Normalize ``ent._.text`` for every entity in ``doc``; return ``doc``.

        The entity's current ``_.text`` value is tokenized and run through
        the matcher. For each match, ``_.text`` is overwritten with the
        stripped canonical name, so when several patterns match, the last
        match yielded by the matcher determines the final value.

        NOTE(review): assumes a ``text`` extension attribute is registered on
        entity spans elsewhere -- it is not set up in this class.
        """
        for ent in doc.ents:
            ent_doc = self.nlp.make_doc(ent._.text)
            for match_id, _start, _end in self.matcher(ent_doc):
                canonical = self.nlp.vocab.strings[match_id]
                ent._.text = canonical.strip()
        return doc
| |
|