Update spaCy pipeline
Browse files- README.md +4 -2
- config.cfg +1 -1
- count_extraction_component.py +1 -14
- en_setec_mk_tv-any-py3-none-any.whl +2 -2
- feature_aggregator_component.py +14 -6
- meta.json +1 -1
- normalizer_component.py +1 -0
README.md
CHANGED
|
@@ -24,7 +24,7 @@ model-index:
|
|
| 24 |
| Feature | Description |
|
| 25 |
| --- | --- |
|
| 26 |
| **Name** | `en_setec_mk_tv` |
|
| 27 |
-
| **Version** | `0.0.
|
| 28 |
| **spaCy** | `>=3.7.5,<3.8.0` |
|
| 29 |
| **Default Pipeline** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
|
| 30 |
| **Components** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
|
|
@@ -51,4 +51,6 @@ model-index:
|
|
| 51 |
| --- | --- |
|
| 52 |
| `ENTS_F` | 99.18 |
|
| 53 |
| `ENTS_P` | 99.20 |
|
| 54 |
-
| `ENTS_R` | 99.16 |
|
|
|
|
|
|
|
|
|
| 24 |
| Feature | Description |
|
| 25 |
| --- | --- |
|
| 26 |
| **Name** | `en_setec_mk_tv` |
|
| 27 |
+
| **Version** | `0.0.2` |
|
| 28 |
| **spaCy** | `>=3.7.5,<3.8.0` |
|
| 29 |
| **Default Pipeline** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
|
| 30 |
| **Components** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
|
|
|
|
| 51 |
| --- | --- |
|
| 52 |
| `ENTS_F` | 99.18 |
|
| 53 |
| `ENTS_P` | 99.20 |
|
| 54 |
+
| `ENTS_R` | 99.16 |
|
| 55 |
+
| `TOK2VEC_LOSS` | 49774.20 |
|
| 56 |
+
| `NER_LOSS` | 66917.02 |
|
config.cfg
CHANGED
|
@@ -31,7 +31,7 @@ factory = "feature_aggregator_component"
|
|
| 31 |
[components.feature_aggregator_component.config]
|
| 32 |
|
| 33 |
[components.feature_aggregator_component.config.AUDIO_FEATURE]
|
| 34 |
-
method = "
|
| 35 |
|
| 36 |
[components.feature_aggregator_component.config.COLOR]
|
| 37 |
method = "join"
|
|
|
|
| 31 |
[components.feature_aggregator_component.config]
|
| 32 |
|
| 33 |
[components.feature_aggregator_component.config.AUDIO_FEATURE]
|
| 34 |
+
method = "expand"
|
| 35 |
|
| 36 |
[components.feature_aggregator_component.config.COLOR]
|
| 37 |
method = "join"
|
count_extraction_component.py
CHANGED
|
@@ -7,6 +7,7 @@ import re
|
|
| 7 |
# https://spacy.io/usage/processing-pipelines#custom-components
|
| 8 |
@Language.factory("count_extraction_component")
|
| 9 |
class CountExtractorComponent(object):
|
|
|
|
| 10 |
def __init__(self, nlp, name, label="CONNECTION"):
|
| 11 |
self.label = label
|
| 12 |
self.reg_left = re.compile(r"^(?P<count>\d+)\s*[xX]\s*(?P<name>.+)$")
|
|
@@ -16,8 +17,6 @@ class CountExtractorComponent(object):
|
|
| 16 |
# set extensions to tokens, spans and docs
|
| 17 |
Span.set_extension("count", default=None, force=True)
|
| 18 |
Span.set_extension("text", default=None, force=True)
|
| 19 |
-
|
| 20 |
-
Doc.set_extension("connections", getter=self.connections, force=True)
|
| 21 |
|
| 22 |
def __call__(self, doc):
|
| 23 |
for ent in doc.ents:
|
|
@@ -50,15 +49,3 @@ class CountExtractorComponent(object):
|
|
| 50 |
ent._.text = text
|
| 51 |
ent._.count = 1
|
| 52 |
return doc
|
| 53 |
-
|
| 54 |
-
def connections(self, doc):
|
| 55 |
-
connections = {}
|
| 56 |
-
for ent in doc.ents:
|
| 57 |
-
if ent._.count is None:
|
| 58 |
-
continue
|
| 59 |
-
|
| 60 |
-
if ent._.text not in connections:
|
| 61 |
-
connections[ent._.text] = ent._.count
|
| 62 |
-
continue
|
| 63 |
-
connections[ent._.text] += ent._.count
|
| 64 |
-
return connections
|
|
|
|
| 7 |
# https://spacy.io/usage/processing-pipelines#custom-components
|
| 8 |
@Language.factory("count_extraction_component")
|
| 9 |
class CountExtractorComponent(object):
|
| 10 |
+
# By default, counts are only extracted from the CONNECTION type, but this can be changed.
|
| 11 |
def __init__(self, nlp, name, label="CONNECTION"):
|
| 12 |
self.label = label
|
| 13 |
self.reg_left = re.compile(r"^(?P<count>\d+)\s*[xX]\s*(?P<name>.+)$")
|
|
|
|
| 17 |
# set extensions to tokens, spans and docs
|
| 18 |
Span.set_extension("count", default=None, force=True)
|
| 19 |
Span.set_extension("text", default=None, force=True)
|
|
|
|
|
|
|
| 20 |
|
| 21 |
def __call__(self, doc):
|
| 22 |
for ent in doc.ents:
|
|
|
|
| 49 |
ent._.text = text
|
| 50 |
ent._.count = 1
|
| 51 |
return doc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
en_setec_mk_tv-any-py3-none-any.whl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ed6b65aefd826e2c8c35c2e60bf17d929e5eaf7f450d965b260c1e1a5e1ea7f
|
| 3 |
+
size 5709306
|
feature_aggregator_component.py
CHANGED
|
@@ -2,13 +2,15 @@ from spacy.tokens import Doc, Span, Token
|
|
| 2 |
from spacy.language import Language
|
| 3 |
import pandas as pd
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
default_feature_aggregation_config = {
|
| 6 |
-
'AUDIO_FEATURE': {
|
| 7 |
-
'method': 'first',
|
| 8 |
-
},
|
| 9 |
-
'COLOR': {
|
| 10 |
-
'method': 'join',
|
| 11 |
-
},
|
| 12 |
'INCH': {
|
| 13 |
'method': 'first',
|
| 14 |
},
|
|
@@ -30,6 +32,12 @@ default_feature_aggregation_config = {
|
|
| 30 |
'VIDEO_FEATURE': {
|
| 31 |
'method': 'expand',
|
| 32 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
'WIRELESS_FEATURE': {
|
| 34 |
'method': 'expand',
|
| 35 |
},
|
|
|
|
| 2 |
from spacy.language import Language
|
| 3 |
import pandas as pd
|
| 4 |
|
| 5 |
+
# Default modes for feature extraction for the labels
|
| 6 |
+
#
|
| 7 |
+
# There are three methods:
|
| 8 |
+
# - `first`: Which gets the first occurrence and stops; this is nice for features like resolution,
|
| 9 |
+
# if the resolution is repeated we just want the first occurrence.
|
| 10 |
+
# - `expand`: This effectively does one-hot encoding where the feature value names
|
| 11 |
+
# become columns and 1 is put if the feature is there.
|
| 12 |
+
# - `join`: This concatenates the feature values under the feature label.
|
| 13 |
default_feature_aggregation_config = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
'INCH': {
|
| 15 |
'method': 'first',
|
| 16 |
},
|
|
|
|
| 32 |
'VIDEO_FEATURE': {
|
| 33 |
'method': 'expand',
|
| 34 |
},
|
| 35 |
+
'AUDIO_FEATURE': {
|
| 36 |
+
'method': 'expand',
|
| 37 |
+
},
|
| 38 |
+
'COLOR': {
|
| 39 |
+
'method': 'join',
|
| 40 |
+
},
|
| 41 |
'WIRELESS_FEATURE': {
|
| 42 |
'method': 'expand',
|
| 43 |
},
|
meta.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"lang":"en",
|
| 3 |
"name":"setec_mk_tv",
|
| 4 |
-
"version":"0.0.
|
| 5 |
"description":"",
|
| 6 |
"author":"",
|
| 7 |
"email":"",
|
|
|
|
| 1 |
{
|
| 2 |
"lang":"en",
|
| 3 |
"name":"setec_mk_tv",
|
| 4 |
+
"version":"0.0.2",
|
| 5 |
"description":"",
|
| 6 |
"author":"",
|
| 7 |
"email":"",
|
normalizer_component.py
CHANGED
|
@@ -3,6 +3,7 @@ import json
|
|
| 3 |
from spacy.language import Language
|
| 4 |
from spacy.matcher import PhraseMatcher
|
| 5 |
|
|
|
|
| 6 |
default_normalization_table = {
|
| 7 |
"Dolby Atmos": ["Dolby Atmos", "Dolby Audio Atmos", "Dolby Atmos Audio"],
|
| 8 |
"Ethernet": [
|
|
|
|
| 3 |
from spacy.language import Language
|
| 4 |
from spacy.matcher import PhraseMatcher
|
| 5 |
|
| 6 |
+
# Default normalization table; it can be customized by passing a different table to the component as a parameter.
|
| 7 |
default_normalization_table = {
|
| 8 |
"Dolby Atmos": ["Dolby Atmos", "Dolby Audio Atmos", "Dolby Atmos Audio"],
|
| 9 |
"Ethernet": [
|