File size: 2,714 Bytes
03f139a ecea783 03f139a 0044d08 03f139a 0044d08 03f139a 864cb91 03f139a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 | import json
from spacy.language import Language
from spacy.matcher import PhraseMatcher
# Default normalization table: maps each canonical name to the list of variants that are normalized to it. It can be replaced by passing a custom table to the component as a parameter.
default_normalization_table = {
    # --- Audio formats and physical connectors ---
    "Dolby Atmos": [
        "Dolby Atmos",
        "Dolby Audio Atmos",
        "Dolby Atmos Audio",
    ],
    "Ethernet": [
        "Ethernet LAN",
        "Ethernet port RJ-45",
        "Ethernet RJ-45",
        "Ethernet RJ45",
        "Ethernet-LAN RJ-45",
        "LAN RJ45",
        "Ethernet R45",
    ],
    "CI+ Slot": [
        "CI+ Card Slot",
        "Common Interface Plus (CI+)",
        "Common Interface Plus",
        "Card Slot CI +",
    ],
    "Scart": [
        "SCART",
        "Scart Input",
    ],
    "Component In": [
        "Component In",
        "Component in(YPbPr)",
        "Component Input",
        "Component (Y/Pb/Pr)",
        "Component In (Y/Pb/Pr)",
    ],
    "USB 2.0": [
        "USB2.0",
    ],
    "Digital Audio": [
        "Digital Audio Out",
        "Digital Audio Output",
        "Digital Audio Output(Coaxial and Optic)",
    ],
    "Composite In": [
        "Composite",
        "AV Composite In",
    ],
    "3.5mm Headphone jack": [
        "3.5mm Headphone jack",
        "Headphone 3.5mm jack",
    ],
    "Optical Audio Out": [
        "Optical Audio Out",
        "Optical Out",
    ],
    # --- Smart-TV platforms / operating systems ---
    # NOTE(review): "Android" is listed as a variant of both "Android" and
    # "Android TV" (the component matches case-insensitively), so the
    # canonical name chosen for a bare "Android" is ambiguous -- confirm
    # which one is intended.
    "Android": [
        "ANDROID",
    ],
    "Android 7.1": [
        "Android Nougat",
    ],
    "Google TV": [
        "GoogleTV",
        "Google LED TV",
        "Google miniLED TV",
        "Google OLED TV",
    ],
    "VIDAA U4": [
        "VIDAA U4.0",
    ],
    "Android TV": [
        "Android TV",
        "AndroidTV",
        "Android",
    ],
    "Titan OS": [
        "TITAN OS",
    ],
    # --- Resolution labels mapped to pixel dimensions ---
    "7680x4320": [
        "8K",
    ],
    "3840x2160": [
        "4K",
        "4K UltraHD",
        "4K Ultra HD",
        "UltraHD",
        "Ultra HD",
    ],
    "1920x1080": [
        "FullHD",
        "Full HD",
    ],
    "1366x768": [
        "HD Ready",
        "HDReady",
    ],
    "1280x720": [
        "HD",
    ],
    "640x480": [
        "SD",
    ],
    # --- Wireless connectivity ---
    "Wifi": [
        "Wifi",
        "Wi-Fi",
        "Wifi built in",
        "built in Wifi",
        "WiFi integrated",
    ],
    # NOTE(review): the canonical key is spelled "Blutooth" (sic). It is
    # emitted at runtime, so it is left unchanged here -- confirm whether
    # downstream consumers rely on this spelling before correcting it.
    "Blutooth": [
        "BLUETOOTH",
    ],
}
@Language.factory("normalizer_component")
class NormalizerComponent:
    """Pipeline component that rewrites entity text to a canonical name.

    Every variant in the normalization table is registered with a
    case-insensitive ``PhraseMatcher`` under its canonical name. When the
    component runs, the ``ent._.text`` value of each entity is matched against
    those variants and, on a hit, replaced by the canonical name.

    NOTE(review): the component reads and writes ``ent._.text``; it assumes a
    custom ``text`` extension is registered on spans elsewhere -- confirm.
    """

    def __init__(self, nlp, name, norms=None):
        """Build the phrase matcher from the normalization table.

        Args:
            nlp: The pipeline object; its vocab and tokenizer are used to
                build and apply the patterns.
            name: Component name supplied by the factory (not used here).
            norms: ``None`` to use ``default_normalization_table``, a ``str``
                path to a JSON file holding the table, or the table itself
                (mapping of canonical name to a list of variant strings).
        """
        if norms is None:
            self.norm_table = default_normalization_table
        elif isinstance(norms, str):
            # Use a context manager so the file handle is closed promptly.
            with open(norms, encoding="utf-8") as table_file:
                self.norm_table = json.load(table_file)
        else:
            self.norm_table = norms
        # attr="LOWER" makes matching case-insensitive.
        self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        self.nlp = nlp
        # The canonical name doubles as the match key, so a match id can be
        # resolved back to the replacement text through the string store.
        for canonical, patterns in self.norm_table.items():
            self.matcher.add(
                canonical, [nlp.make_doc(pattern) for pattern in patterns]
            )

    def __call__(self, doc):
        """Normalize ``ent._.text`` for every entity in ``doc``.

        When several variants match the same entity text (e.g. "HD" inside
        "4K Ultra HD"), the match covering the most tokens wins so that a
        short sub-span cannot override a more specific variant. Among matches
        of equal length the later one wins. Entities without any match are
        left unchanged.

        Args:
            doc: The document whose entities are normalized in place.

        Returns:
            The same ``doc`` object.
        """
        for ent in doc.ents:
            best_id = None
            best_len = 0
            for match_id, start, end in self.matcher(self.nlp.make_doc(ent._.text)):
                span_len = end - start
                # ">=" keeps the later match on ties.
                if span_len >= best_len:
                    best_id = match_id
                    best_len = span_len
            if best_id is not None:
                ent._.text = self.nlp.vocab.strings[best_id].strip()
        return doc
|