File size: 2,714 Bytes
03f139a
 
 
 
 
ecea783
03f139a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0044d08
03f139a
 
 
 
 
0044d08
 
 
 
 
 
 
03f139a
 
 
 
 
 
 
 
 
 
864cb91
03f139a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import json

from spacy.language import Language
from spacy.matcher import PhraseMatcher

# Default normalization table: maps each canonical label (key) to the list of
# surface forms (values) that should be rewritten to that label. It is used
# when the component receives no `norms` argument; pass a custom table (or a
# path to a JSON file with the same shape) to override it.
#
# NOTE(review): the patterns are matched case-insensitively (the component
# builds its PhraseMatcher with attr="LOWER"), so the "Android" pattern listed
# under "Android TV" and the "ANDROID" pattern listed under "Android" match the
# same text. Which label wins depends on the order the matcher reports its
# matches -- confirm the intended label and drop one of the two patterns.
default_normalization_table = {
    "Dolby Atmos": ["Dolby Atmos", "Dolby Audio Atmos", "Dolby Atmos Audio"],
    "Ethernet": [
        "Ethernet LAN",
        "Ethernet port RJ-45",
        "Ethernet RJ-45",
        "Ethernet RJ45",
        "Ethernet-LAN RJ-45",
        "LAN RJ45",
        "Ethernet R45",
    ],
    "CI+ Slot": [
        "CI+ Card Slot",
        "Common Interface Plus (CI+)",
        "Common Interface Plus",
        "Card Slot CI +",
    ],
    "Scart": ["SCART", "Scart Input"],
    "Component In": [
        "Component In",
        "Component in(YPbPr)",
        "Component Input",
        "Component  (Y/Pb/Pr)",
        "Component In (Y/Pb/Pr)",
    ],
    "USB 2.0": ["USB2.0"],
    "Digital Audio": [
        "Digital Audio Out",
        "Digital Audio Output",
        "Digital Audio Output(Coaxial and Optic)",
    ],
    "Composite In": ["Composite", "AV Composite In"],
    "3.5mm Headphone jack": ["3.5mm Headphone jack", "Headphone 3.5mm jack"],
    "Optical Audio Out": ["Optical Audio Out", "Optical Out"],
    "Android": ["ANDROID"],
    "Android 7.1": ["Android Nougat"],
    "Google TV": ["GoogleTV", "Google LED TV", "Google miniLED TV", "Google OLED TV"],
    "VIDAA U4": ["VIDAA U4.0"],
    "Android TV": ["Android TV", "AndroidTV", "Android"],
    "Titan OS": ["TITAN OS"],
    # Marketing resolution names are normalized to explicit pixel dimensions.
    "7680x4320": ["8K"],
    "3840x2160": ["4K", "4K UltraHD", "4K Ultra HD", "UltraHD", "Ultra HD"],
    "1920x1080": ["FullHD", "Full HD"],
    "1366x768": ["HD Ready", "HDReady"],
    "1280x720": ["HD"],
    "640x480": ["SD"],
    "Wifi": ["Wifi", "Wi-Fi", "Wifi built in", "built in Wifi", "WiFi integrated"],
    # Canonical label was previously misspelled "Blutooth".
    "Bluetooth": ["BLUETOOTH"],
}


@Language.factory("normalizer_component")
class NormalizerComponent:
    """Pipeline component that rewrites entity text to a canonical label.

    A normalization table maps each canonical label to a list of surface
    forms. Every surface form is registered with a case-insensitive
    ``PhraseMatcher``; when one is found inside an entity's ``ent._.text``,
    that attribute is replaced by the canonical label.

    NOTE(review): this component reads and writes the custom extension
    attribute ``text`` on entity spans; it assumes that extension has been
    registered elsewhere before the pipeline runs -- confirm.
    """

    def __init__(self, nlp, name, norms=None):
        """Build the phrase matcher from the normalization table.

        Args:
            nlp: The pipeline object; its vocab and tokenizer are used to
                build and apply the patterns.
            name: The component name supplied by the factory (not used here).
            norms: ``None`` to use ``default_normalization_table``, a ``str``
                path to a JSON file holding the table, or the table itself
                (mapping of canonical label -> list of surface forms).
        """
        if norms is None:
            self.norm_table = default_normalization_table
        elif isinstance(norms, str):
            # Close the file deterministically instead of leaking the handle;
            # JSON text is UTF-8, so do not depend on the locale's encoding.
            with open(norms, encoding="utf-8") as norms_file:
                self.norm_table = json.load(norms_file)
        else:
            self.norm_table = norms

        # attr="LOWER" makes pattern matching case-insensitive.
        self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        self.nlp = nlp

        # The canonical label doubles as the match key, so it can be read back
        # from the vocab's string store when a match is found.
        for canonical, patterns in self.norm_table.items():
            self.matcher.add(canonical, [nlp.make_doc(pattern) for pattern in patterns])

    def __call__(self, doc):
        """Normalize ``ent._.text`` for every entity in ``doc`` and return ``doc``.

        The entity text is tokenized once and matched against all patterns.
        Each match overwrites ``ent._.text`` with its canonical label, so when
        several patterns match, the last match reported by the matcher wins.
        Entities without any match are left unchanged.
        """
        for ent in doc.ents:
            matches = self.matcher(self.nlp.make_doc(ent._.text))
            for match_id, _start, _end in matches:
                # match_id is the hash of the canonical label; resolve it back
                # to the string via the vocab's string store.
                ent._.text = self.nlp.vocab.strings[match_id].strip()
        return doc