Update spaCy pipeline
Browse files
- README.md +2 -2
- config.cfg +37 -2
- en_setec_mk_tv-any-py3-none-any.whl +2 -2
- feature_aggregator_component.py +101 -0
- meta.json +4 -2
- normalizer_component.py +8 -8
README.md
CHANGED
|
@@ -26,8 +26,8 @@ model-index:
|
|
| 26 |
| **Name** | `en_setec_mk_tv` |
|
| 27 |
| **Version** | `0.0.0` |
|
| 28 |
| **spaCy** | `>=3.7.5,<3.8.0` |
|
| 29 |
-
| **Default Pipeline** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component` |
|
| 30 |
-
| **Components** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component` |
|
| 31 |
| **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
|
| 32 |
| **Sources** | n/a |
|
| 33 |
| **License** | n/a |
|
|
|
|
| 26 |
| **Name** | `en_setec_mk_tv` |
|
| 27 |
| **Version** | `0.0.0` |
|
| 28 |
| **spaCy** | `>=3.7.5,<3.8.0` |
|
| 29 |
+
| **Default Pipeline** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
|
| 30 |
+
| **Components** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
|
| 31 |
| **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
|
| 32 |
| **Sources** | n/a |
|
| 33 |
| **License** | n/a |
|
config.cfg
CHANGED
|
@@ -10,7 +10,7 @@ seed = 0
|
|
| 10 |
|
| 11 |
[nlp]
|
| 12 |
lang = "en"
|
| 13 |
-
pipeline = ["tok2vec","ner","count_extraction_component","normalizer_component"]
|
| 14 |
batch_size = 1000
|
| 15 |
disabled = []
|
| 16 |
before_creation = null
|
|
@@ -25,6 +25,41 @@ vectors = {"@vectors":"spacy.Vectors.v1"}
|
|
| 25 |
factory = "count_extraction_component"
|
| 26 |
label = "CONNECTION"
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
[components.ner]
|
| 29 |
factory = "ner"
|
| 30 |
incorrect_spans_key = null
|
|
@@ -48,7 +83,7 @@ upstream = "*"
|
|
| 48 |
|
| 49 |
[components.normalizer_component]
|
| 50 |
factory = "normalizer_component"
|
| 51 |
-
|
| 52 |
|
| 53 |
[components.tok2vec]
|
| 54 |
factory = "tok2vec"
|
|
|
|
| 10 |
|
| 11 |
[nlp]
|
| 12 |
lang = "en"
|
| 13 |
+
pipeline = ["tok2vec","ner","count_extraction_component","normalizer_component","feature_aggregator_component"]
|
| 14 |
batch_size = 1000
|
| 15 |
disabled = []
|
| 16 |
before_creation = null
|
|
|
|
| 25 |
factory = "count_extraction_component"
|
| 26 |
label = "CONNECTION"
|
| 27 |
|
| 28 |
+
[components.feature_aggregator_component]
|
| 29 |
+
factory = "feature_aggregator_component"
|
| 30 |
+
|
| 31 |
+
[components.feature_aggregator_component.config]
|
| 32 |
+
|
| 33 |
+
[components.feature_aggregator_component.config.AUDIO_FEATURE]
|
| 34 |
+
method = "first"
|
| 35 |
+
|
| 36 |
+
[components.feature_aggregator_component.config.COLOR]
|
| 37 |
+
method = "join"
|
| 38 |
+
|
| 39 |
+
[components.feature_aggregator_component.config.INCH]
|
| 40 |
+
method = "first"
|
| 41 |
+
|
| 42 |
+
[components.feature_aggregator_component.config.MOUNTING_FEATURE]
|
| 43 |
+
method = "join"
|
| 44 |
+
|
| 45 |
+
[components.feature_aggregator_component.config.OS]
|
| 46 |
+
method = "first"
|
| 47 |
+
|
| 48 |
+
[components.feature_aggregator_component.config.REFRESH_RATE]
|
| 49 |
+
method = "first"
|
| 50 |
+
|
| 51 |
+
[components.feature_aggregator_component.config.RESOLUTION]
|
| 52 |
+
method = "first"
|
| 53 |
+
|
| 54 |
+
[components.feature_aggregator_component.config.SOFTWARE_FEATURE]
|
| 55 |
+
method = "expand"
|
| 56 |
+
|
| 57 |
+
[components.feature_aggregator_component.config.VIDEO_FEATURE]
|
| 58 |
+
method = "expand"
|
| 59 |
+
|
| 60 |
+
[components.feature_aggregator_component.config.WIRELESS_FEATURE]
|
| 61 |
+
method = "expand"
|
| 62 |
+
|
| 63 |
[components.ner]
|
| 64 |
factory = "ner"
|
| 65 |
incorrect_spans_key = null
|
|
|
|
| 83 |
|
| 84 |
[components.normalizer_component]
|
| 85 |
factory = "normalizer_component"
|
| 86 |
+
norms = null
|
| 87 |
|
| 88 |
[components.tok2vec]
|
| 89 |
factory = "tok2vec"
|
en_setec_mk_tv-any-py3-none-any.whl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:95caa99fc72a38765852ae3e92072e5c28bf4357f166a16a2cfff6969b5c03e9
|
| 3 |
+
size 5709056
|
feature_aggregator_component.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from spacy.tokens import Doc, Span, Token
|
| 2 |
+
from spacy.language import Language
|
| 3 |
+
|
| 4 |
+
# Default aggregation strategy for each entity label, keyed by label name.
# Each entry holds a 'method' that FeatureAggregatorComponent.features()
# dispatches on:
#   'first'  - keep a single value for the label
#   'join'   - comma-join all values into one string
#   'expand' - emit one feature per value, each set to 1
default_feature_aggregation_config = dict(
    AUDIO_FEATURE=dict(method='first'),
    COLOR=dict(method='join'),
    INCH=dict(method='first'),
    MOUNTING_FEATURE=dict(method='join'),
    OS=dict(method='first'),
    REFRESH_RATE=dict(method='first'),
    RESOLUTION=dict(method='first'),
    SOFTWARE_FEATURE=dict(method='expand'),
    VIDEO_FEATURE=dict(method='expand'),
    WIRELESS_FEATURE=dict(method='expand'),
)
|
| 36 |
+
|
| 37 |
+
@Language.factory("feature_aggregator_component")
class FeatureAggregatorComponent(object):
    """Pipeline component that aggregates a doc's entities into a feature dict.

    The component does not modify the doc when called. Instead it registers
    these ``Doc`` extensions (with ``force=True``, so a later instance
    overrides an earlier one):

    * ``doc._.raw_features`` - un-aggregated features (see ``raw_features``)
    * ``doc._.features`` - aggregated features (see ``features``)
    * ``doc._.add_to_dataframe(df)`` - append the features as a row to ``df``
    * ``doc._.feature_aggregation_config`` - get/set the aggregation config

    Entities are read through ``ent._.count`` and ``ent._.text``; those
    extensions are not defined here.
    NOTE(review): presumably they are registered by earlier pipeline
    components (count extraction / normalizer) - confirm.
    """

    def __init__(self, nlp, name, config=default_feature_aggregation_config):
        """Store the aggregation config and register the Doc extensions.

        Args:
            nlp: The pipeline object handed in by the factory (unused here).
            name: The component instance name (unused here).
            config: Mapping of entity label -> ``{'method': ...}`` where the
                method is ``'first'``, ``'join'`` or ``'expand'``.
        """
        # NOTE: the default is the shared module-level dict; it is stored by
        # reference, so in-place edits to it are visible to every instance.
        self.config = config
        Doc.set_extension("raw_features", getter=self.raw_features, force=True)
        Doc.set_extension("features", getter=self.features, force=True)
        Doc.set_extension("add_to_dataframe", method=self.add_to_dataframe, force=True)
        Doc.set_extension(
            "feature_aggregation_config",
            getter=self.get_feature_aggregation_config,
            setter=self.set_feature_aggregation_config,
            force=True,
        )

    def __call__(self, doc):
        """Return ``doc`` unchanged; all work happens in the extension getters."""
        return doc

    def get_feature_aggregation_config(self, doc):
        """Getter for ``doc._.feature_aggregation_config``; ``doc`` is ignored."""
        return self.config

    def set_feature_aggregation_config(self, doc, config):
        """Setter for ``doc._.feature_aggregation_config``.

        Replaces the config on the component itself, so the change applies to
        every doc, not only the one it was set through.
        """
        self.config = config

    def _collect_features(self, doc):
        """Collect entity values per feature, preserving document order.

        Returns:
            dict: for entities without a count, ``label -> dict`` whose keys
            are the distinct ``ent._.text`` values in order of first
            appearance (a dict is used as an ordered set); for entities with
            a count, ``ent._.text -> accumulated count``.
        """
        collected = {}
        for ent in doc.ents:
            if ent._.count is None:
                collected.setdefault(ent.label_, {})[ent._.text] = None
            else:
                # If it has a count we put it in a separate column and
                # accumulate the counts.
                collected[ent._.text] = collected.get(ent._.text, 0) + ent._.count
        return collected

    def raw_features(self, doc):
        """Return the un-aggregated features of ``doc``.

        Returns:
            dict: ``label -> set of ent._.text`` for entities without a
            count, and ``ent._.text -> summed count`` for entities with one.
        """
        return {
            name: set(values) if isinstance(values, dict) else values
            for name, values in self._collect_features(doc).items()
        }

    def features(self, doc):
        """Return the features of ``doc`` aggregated according to the config.

        Per configured label the ``'method'`` decides the result:

        * ``'first'`` - the first value in document order (previously an
          arbitrary element taken with ``set.pop()``)
        * ``'join'`` - all distinct values comma-joined, in document order
        * ``'expand'`` - one feature per value, each set to ``1``

        Labels without a config entry or without a ``'method'`` key, and
        labels with an unknown method, keep their set of values. Accumulated
        counts are passed through unchanged.
        """
        features = {}
        for name, values in self._collect_features(doc).items():
            if not isinstance(values, dict):
                # Accumulated count: there is nothing to aggregate, and the
                # set-oriented methods below would fail on an int.
                features[name] = values
                continue

            if name not in self.config or 'method' not in self.config[name]:
                features[name] = set(values)
                continue

            method = self.config[name]["method"]
            if method == 'first':
                # Iteration order of the dict is insertion (document) order.
                features[name] = next(iter(values), float('nan'))
            elif method == 'join':
                features[name] = ','.join(values)
            elif method == 'expand':
                for value in values:
                    features[value] = 1
            else:
                print(f"unknown feature aggregation method: {method}, skipping...")
                features[name] = set(values)
        return features

    def add_to_dataframe(self, doc, df):
        """Append the aggregated features of ``doc`` as one row to ``df``.

        Args:
            doc: The processed document.
            df: The ``pandas.DataFrame`` to extend.

        Returns:
            A new DataFrame: ``df`` concatenated with a single-row frame
            built from ``features(doc)``. ``df`` itself is not modified.
        """
        # Imported locally: the module has no top-level pandas import, so the
        # original reference to ``pd`` raised NameError.
        import pandas as pd

        row = {name: [feature] for name, feature in self.features(doc).items()}
        return pd.concat([df, pd.DataFrame(row)])
|
meta.json
CHANGED
|
@@ -37,13 +37,15 @@
|
|
| 37 |
"tok2vec",
|
| 38 |
"ner",
|
| 39 |
"count_extraction_component",
|
| 40 |
-
"normalizer_component"
|
|
|
|
| 41 |
],
|
| 42 |
"components":[
|
| 43 |
"tok2vec",
|
| 44 |
"ner",
|
| 45 |
"count_extraction_component",
|
| 46 |
-
"normalizer_component"
|
|
|
|
| 47 |
],
|
| 48 |
"disabled":[
|
| 49 |
|
|
|
|
| 37 |
"tok2vec",
|
| 38 |
"ner",
|
| 39 |
"count_extraction_component",
|
| 40 |
+
"normalizer_component",
|
| 41 |
+
"feature_aggregator_component"
|
| 42 |
],
|
| 43 |
"components":[
|
| 44 |
"tok2vec",
|
| 45 |
"ner",
|
| 46 |
"count_extraction_component",
|
| 47 |
+
"normalizer_component",
|
| 48 |
+
"feature_aggregator_component"
|
| 49 |
],
|
| 50 |
"disabled":[
|
| 51 |
|
normalizer_component.py
CHANGED
|
@@ -50,19 +50,19 @@ default_normalization_table = {
|
|
| 50 |
"1280x720": ["HD"],
|
| 51 |
"640x480": ["SD"],
|
| 52 |
"Wifi": ["Wifi", "Wi-Fi", "Wifi built in", "built in Wifi", "WiFi integrated"],
|
| 53 |
-
"
|
| 54 |
}
|
| 55 |
|
| 56 |
|
| 57 |
@Language.factory("normalizer_component")
|
| 58 |
class NormalizerComponent(object):
|
| 59 |
-
def __init__(self, nlp, name,
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
|
| 67 |
self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
|
| 68 |
self.nlp = nlp
|
|
|
|
| 50 |
"1280x720": ["HD"],
|
| 51 |
"640x480": ["SD"],
|
| 52 |
"Wifi": ["Wifi", "Wi-Fi", "Wifi built in", "built in Wifi", "WiFi integrated"],
|
| 53 |
+
"Blutooth": ["BLUETOOTH"],
|
| 54 |
}
|
| 55 |
|
| 56 |
|
| 57 |
@Language.factory("normalizer_component")
|
| 58 |
class NormalizerComponent(object):
|
| 59 |
+
def __init__(self, nlp, name, norms=None):
|
| 60 |
+
if norms is None:
|
| 61 |
+
self.norm_table = default_normalization_table
|
| 62 |
+
elif isinstance(norms, str):
|
| 63 |
+
self.norm_table = json.load(open(norms))
|
| 64 |
+
else:
|
| 65 |
+
self.norm_table = norms
|
| 66 |
|
| 67 |
self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
|
| 68 |
self.nlp = nlp
|