Update spaCy pipeline
Browse files- README.md +2 -2
- __pycache__/use_custom_tokenizer.cpython-312.pyc +0 -0
- base_transformer/model +1 -1
- config.cfg +1 -4
- it_trf_nrp-any-py3-none-any.whl +2 -2
- meta.json +0 -2
- ner_transformer/model +1 -1
- use_custom_tokenizer.py +5 -6
README.md
CHANGED
|
@@ -26,8 +26,8 @@ model-index:
|
|
| 26 |
| **Name** | `it_trf_nrp` |
|
| 27 |
| **Version** | `0.0.0` |
|
| 28 |
| **spaCy** | `>=3.8.3,<3.9.0` |
|
| 29 |
-
| **Default Pipeline** | `ner_transformer`, `ner`, `merge_entities`, `base_transformer`, `morphologizer`, `tagger`, `parser`, `trainable_lemmatizer` |
|
| 30 |
-
| **Components** | `ner_transformer`, `ner`, `merge_entities`, `base_transformer`, `morphologizer`, `tagger`, `parser`, `trainable_lemmatizer` |
|
| 31 |
| **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
|
| 32 |
| **Sources** | n/a |
|
| 33 |
| **License** | n/a |
|
|
|
|
| 26 |
| **Name** | `it_trf_nrp` |
|
| 27 |
| **Version** | `0.0.0` |
|
| 28 |
| **spaCy** | `>=3.8.3,<3.9.0` |
|
| 29 |
+
| **Default Pipeline** | `ner_transformer`, `ner`, `base_transformer`, `morphologizer`, `tagger`, `parser`, `trainable_lemmatizer` |
|
| 30 |
+
| **Components** | `ner_transformer`, `ner`, `base_transformer`, `morphologizer`, `tagger`, `parser`, `trainable_lemmatizer` |
|
| 31 |
| **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
|
| 32 |
| **Sources** | n/a |
|
| 33 |
| **License** | n/a |
|
__pycache__/use_custom_tokenizer.cpython-312.pyc
ADDED
|
Binary file (1.04 kB). View file
|
|
|
base_transformer/model
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 443821706
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:382b251bd81e6c65fe4033674bc76b90eee93409d6c567aafbfc11d94c2e22cc
|
| 3 |
size 443821706
|
config.cfg
CHANGED
|
@@ -10,7 +10,7 @@ seed = 17
|
|
| 10 |
|
| 11 |
[nlp]
|
| 12 |
lang = "it"
|
| 13 |
-
pipeline = ["ner_transformer","ner","merge_entities","base_transformer","morphologizer","tagger","parser","trainable_lemmatizer"]
|
| 14 |
batch_size = 512
|
| 15 |
disabled = []
|
| 16 |
before_creation = null
|
|
@@ -43,9 +43,6 @@ use_fast = true
|
|
| 43 |
|
| 44 |
[components.base_transformer.model.transformer_config]
|
| 45 |
|
| 46 |
-
[components.merge_entities]
|
| 47 |
-
factory = "merge_entities"
|
| 48 |
-
|
| 49 |
[components.morphologizer]
|
| 50 |
factory = "morphologizer"
|
| 51 |
extend = false
|
|
|
|
| 10 |
|
| 11 |
[nlp]
|
| 12 |
lang = "it"
|
| 13 |
+
pipeline = ["ner_transformer","ner","base_transformer","morphologizer","tagger","parser","trainable_lemmatizer"]
|
| 14 |
batch_size = 512
|
| 15 |
disabled = []
|
| 16 |
before_creation = null
|
|
|
|
| 43 |
|
| 44 |
[components.base_transformer.model.transformer_config]
|
| 45 |
|
|
|
|
|
|
|
|
|
|
| 46 |
[components.morphologizer]
|
| 47 |
factory = "morphologizer"
|
| 48 |
extend = false
|
it_trf_nrp-any-py3-none-any.whl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e1c331f6965c59e36deb2bb546991791a0055e2abd1c39a00bda81d5590bdacd
|
| 3 |
+
size 825600865
|
meta.json
CHANGED
|
@@ -827,7 +827,6 @@
|
|
| 827 |
"pipeline":[
|
| 828 |
"ner_transformer",
|
| 829 |
"ner",
|
| 830 |
-
"merge_entities",
|
| 831 |
"base_transformer",
|
| 832 |
"morphologizer",
|
| 833 |
"tagger",
|
|
@@ -837,7 +836,6 @@
|
|
| 837 |
"components":[
|
| 838 |
"ner_transformer",
|
| 839 |
"ner",
|
| 840 |
-
"merge_entities",
|
| 841 |
"base_transformer",
|
| 842 |
"morphologizer",
|
| 843 |
"tagger",
|
|
|
|
| 827 |
"pipeline":[
|
| 828 |
"ner_transformer",
|
| 829 |
"ner",
|
|
|
|
| 830 |
"base_transformer",
|
| 831 |
"morphologizer",
|
| 832 |
"tagger",
|
|
|
|
| 836 |
"components":[
|
| 837 |
"ner_transformer",
|
| 838 |
"ner",
|
|
|
|
| 839 |
"base_transformer",
|
| 840 |
"morphologizer",
|
| 841 |
"tagger",
|
ner_transformer/model
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 440759145
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:82e6707a6d7df1d48c0ade9bf95d437a23daf1a125a1f5457d957a29a007be3a
|
| 3 |
size 440759145
|
use_custom_tokenizer.py
CHANGED
|
@@ -1,13 +1,12 @@
|
|
| 1 |
from spacy.util import registry
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
custom_tokenizer,
|
| 5 |
-
)
|
| 6 |
-
|
| 7 |
|
| 8 |
@registry.tokenizers("customize_tokenizer")
|
| 9 |
def make_customize_tokenizer():
|
| 10 |
def customize_tokenizer(nlp):
|
| 11 |
-
|
|
|
|
|
|
|
| 12 |
|
| 13 |
return customize_tokenizer
|
|
|
|
| 1 |
from spacy.util import registry
|
| 2 |
+
from spacy.tokenizer import Tokenizer
|
| 3 |
+
import pathlib
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
@registry.tokenizers("customize_tokenizer")
|
| 6 |
def make_customize_tokenizer():
|
| 7 |
def customize_tokenizer(nlp):
|
| 8 |
+
tokenizer = Tokenizer(nlp.vocab)
|
| 9 |
+
script_dir = pathlib.Path(__file__).parent.resolve()
|
| 10 |
+
return tokenizer.from_disk(script_dir / "tokenizer")
|
| 11 |
|
| 12 |
return customize_tokenizer
|