Update spaCy pipeline
Browse files
- README.md +1 -1
- base_transformer/model +2 -2
- config.cfg +12 -9
- it_trf_reflex_nrp-0.0.0-py3-none-any.whl +2 -2
- meta.json +4 -4
- ner/model +1 -1
- ner/moves +1 -1
- ner_transformer/model +2 -2
- tokenizer +0 -0
- use_custom_tokenizer.py +21 -5
- vocab/strings.json +0 -0
README.md
CHANGED
|
@@ -9,7 +9,7 @@ language:
|
|
| 9 |
| --- | --- |
|
| 10 |
| **Name** | `it_trf_reflex_nrp` |
|
| 11 |
| **Version** | `0.0.0` |
|
| 12 |
-
| **spaCy** | `>=3.8.
|
| 13 |
| **Default Pipeline** | `ner_transformer`, `ner`, `base_transformer`, `morphologizer`, `tagger`, `parser`, `trainable_lemmatizer` |
|
| 14 |
| **Components** | `ner_transformer`, `ner`, `base_transformer`, `morphologizer`, `tagger`, `parser`, `trainable_lemmatizer` |
|
| 15 |
| **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
|
|
|
|
| 9 |
| --- | --- |
|
| 10 |
| **Name** | `it_trf_reflex_nrp` |
|
| 11 |
| **Version** | `0.0.0` |
|
| 12 |
+
| **spaCy** | `>=3.8.7,<3.9.0` |
|
| 13 |
| **Default Pipeline** | `ner_transformer`, `ner`, `base_transformer`, `morphologizer`, `tagger`, `parser`, `trainable_lemmatizer` |
|
| 14 |
| **Components** | `ner_transformer`, `ner`, `base_transformer`, `morphologizer`, `tagger`, `parser`, `trainable_lemmatizer` |
|
| 15 |
| **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
|
base_transformer/model
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:198bd1e70ebfb9931065cb51fbf76dd01f7568664d7c3f2bc59ad26eb520ad3a
|
| 3 |
+
size 443822163
|
config.cfg
CHANGED
|
@@ -16,7 +16,7 @@ before_creation = null
|
|
| 16 |
after_creation = null
|
| 17 |
after_pipeline_creation = null
|
| 18 |
batch_size = 1000
|
| 19 |
-
tokenizer = {"@tokenizers":"
|
| 20 |
vectors = {"@vectors":"spacy.Vectors.v1"}
|
| 21 |
|
| 22 |
[components]
|
|
@@ -90,7 +90,7 @@ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotati
|
|
| 90 |
|
| 91 |
[components.ner_transformer.model]
|
| 92 |
@architectures = "spacy-transformers.TransformerModel.v3"
|
| 93 |
-
name = "dbmdz/bert-base-
|
| 94 |
mixed_precision = false
|
| 95 |
|
| 96 |
[components.ner_transformer.model.get_spans]
|
|
@@ -228,21 +228,24 @@ eps = 0.00000001
|
|
| 228 |
learn_rate = 0.001
|
| 229 |
|
| 230 |
[training.score_weights]
|
| 231 |
-
ents_f = 0.
|
| 232 |
ents_p = 0.0
|
| 233 |
ents_r = 0.0
|
| 234 |
ents_per_type = null
|
| 235 |
-
pos_acc = 0.
|
| 236 |
-
morph_acc = 0.
|
| 237 |
morph_per_feat = null
|
| 238 |
-
tag_acc = 0.
|
| 239 |
-
|
| 240 |
-
|
|
|
|
|
|
|
|
|
|
| 241 |
dep_las_per_type = null
|
| 242 |
sents_p = null
|
| 243 |
sents_r = null
|
| 244 |
sents_f = 0.0
|
| 245 |
-
lemma_acc = 0.
|
| 246 |
|
| 247 |
[pretraining]
|
| 248 |
|
|
|
|
| 16 |
after_creation = null
|
| 17 |
after_pipeline_creation = null
|
| 18 |
batch_size = 1000
|
| 19 |
+
tokenizer = {"@tokenizers":"customize_tokenizer"}
|
| 20 |
vectors = {"@vectors":"spacy.Vectors.v1"}
|
| 21 |
|
| 22 |
[components]
|
|
|
|
| 90 |
|
| 91 |
[components.ner_transformer.model]
|
| 92 |
@architectures = "spacy-transformers.TransformerModel.v3"
|
| 93 |
+
name = "dbmdz/bert-base-italian-xxl-cased"
|
| 94 |
mixed_precision = false
|
| 95 |
|
| 96 |
[components.ner_transformer.model.get_spans]
|
|
|
|
| 228 |
learn_rate = 0.001
|
| 229 |
|
| 230 |
[training.score_weights]
|
| 231 |
+
ents_f = 0.22
|
| 232 |
ents_p = 0.0
|
| 233 |
ents_r = 0.0
|
| 234 |
ents_per_type = null
|
| 235 |
+
pos_acc = 0.0
|
| 236 |
+
morph_acc = 0.11
|
| 237 |
morph_per_feat = null
|
| 238 |
+
tag_acc = 0.22
|
| 239 |
+
tag_micro_p = null
|
| 240 |
+
tag_micro_r = null
|
| 241 |
+
tag_micro_f = null
|
| 242 |
+
dep_uas = 0.11
|
| 243 |
+
dep_las = 0.11
|
| 244 |
dep_las_per_type = null
|
| 245 |
sents_p = null
|
| 246 |
sents_r = null
|
| 247 |
sents_f = 0.0
|
| 248 |
+
lemma_acc = 0.22
|
| 249 |
|
| 250 |
[pretraining]
|
| 251 |
|
it_trf_reflex_nrp-0.0.0-py3-none-any.whl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2652e97f0f00f72f8a8f3ee70503ebaabeb8b4e07f8b925b24c1204ec63d8804
|
| 3 |
+
size 828275955
|
meta.json
CHANGED
|
@@ -7,8 +7,8 @@
|
|
| 7 |
"email":"",
|
| 8 |
"url":"",
|
| 9 |
"license":"",
|
| 10 |
-
"spacy_version":">=3.8.
|
| 11 |
-
"spacy_git_version":"
|
| 12 |
"vectors":{
|
| 13 |
"width":0,
|
| 14 |
"vectors":0,
|
|
@@ -846,7 +846,7 @@
|
|
| 846 |
|
| 847 |
],
|
| 848 |
"requirements":[
|
| 849 |
-
"spacy-transformers>=1.3.
|
| 850 |
-
"spacy>=3.8.
|
| 851 |
]
|
| 852 |
}
|
|
|
|
| 7 |
"email":"",
|
| 8 |
"url":"",
|
| 9 |
"license":"",
|
| 10 |
+
"spacy_version":">=3.8.7,<3.9.0",
|
| 11 |
+
"spacy_git_version":"4b65aa7",
|
| 12 |
"vectors":{
|
| 13 |
"width":0,
|
| 14 |
"vectors":0,
|
|
|
|
| 846 |
|
| 847 |
],
|
| 848 |
"requirements":[
|
| 849 |
+
"spacy-transformers>=1.3.9,<1.4.0",
|
| 850 |
+
"spacy>=3.8.7,<3.9.0"
|
| 851 |
]
|
| 852 |
}
|
ner/model
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 219690
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5032987473186a0f62555d61538ee0907d892791e1a74a01d3119edd84723f28
|
| 3 |
size 219690
|
ner/moves
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
��movesٴ{"0":{},"1":{"ORG":
|
|
|
|
| 1 |
+
��movesٴ{"0":{},"1":{"ORG":5497,"LOC":4378,"PER":3735},"2":{"ORG":5497,"LOC":4378,"PER":3735},"3":{"ORG":5497,"LOC":4378,"PER":3735},"4":{"ORG":5497,"LOC":4378,"PER":3735,"":1},"5":{"":1}}�cfg��neg_key�
|
ner_transformer/model
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b49959037f8c416a86998262af8eb9e38fdf1e1c0ffb41cd29d65c62475106b8
|
| 3 |
+
size 443822170
|
tokenizer
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
use_custom_tokenizer.py
CHANGED
|
@@ -24,17 +24,26 @@ EXTENDED_LETTER_RANGE = "A-Za-zäöüÄÖÜàòèéìù"
|
|
| 24 |
DATE = r"[0-3][1-9]\.[0-1][1-9]\.[1-2][0-9]{3}"
|
| 25 |
TOP_LEVEL_DOMAINS = "ch|at|de|com|edu|org|gov|net|fr|uk|be|es|pl|it|eu|nl|ba|cz|dk|al|ad|bg|by|fi|gr|ie|li|lu|no|pt|ro|rs|ru|se|si|sk"
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
DOT_AFTER_WORD = [
|
| 28 |
-
rf"(?<!www\.)(?<=([
|
| 29 |
for i in range(3, 30)
|
| 30 |
]
|
| 31 |
|
|
|
|
| 32 |
DOT_AFTER_DATE = rf"(?<=({DATE}))\."
|
| 33 |
|
| 34 |
infix_res = [
|
| 35 |
-
r"[\(\[\]\)]",
|
| 36 |
r"(?<=\.--)\.", # DOT after .--
|
| 37 |
-
|
|
|
|
|
|
|
| 38 |
r"'\.\.", # e.g., 'Tscheicha'.. -> "Tscheicha" "'..", then split ".." as suffix
|
| 39 |
*DOT_AFTER_WORD, # when there is no space after the dot
|
| 40 |
r"[A-Z](?=\. )", # DOT after capital letter
|
|
@@ -45,6 +54,11 @@ LETTER_DOUBLE_ENDING_DOT_VAR_LENGTH = [ # DOT after letter, e.g., A.G., or u.s.
|
|
| 45 |
rf"(?<=([{EXTENDED_LETTER_RANGE}]\.){{{i}}})\." for i in range(1, 30)
|
| 46 |
]
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
suffix_res = [
|
| 49 |
r"(?<=\d)[\.]", # DOT after number
|
| 50 |
r"(?<=[\.])[\]\)]", # Closing brackets with DOT before
|
|
@@ -63,10 +77,12 @@ NUMBER_SIGN_NUMBER_FIXED = r"(?<=[0-9])[+\*^](?=[0-9])"
|
|
| 63 |
|
| 64 |
|
| 65 |
# Given an nlp object, return a custom tokenizer that splits on special cases and has unwanted tokenization removed
|
| 66 |
-
def custom_tokenizer(nlp):
|
| 67 |
nlp.tokenizer = Tokenizer(nlp.vocab)
|
| 68 |
|
| 69 |
-
|
|
|
|
|
|
|
| 70 |
nlp.tokenizer.prefix_search = prefix_regex.search
|
| 71 |
|
| 72 |
# We use the default infixes and remove some cases that lead to unwanted tokenization.
|
|
|
|
| 24 |
DATE = r"[0-3][1-9]\.[0-1][1-9]\.[1-2][0-9]{3}"
|
| 25 |
TOP_LEVEL_DOMAINS = "ch|at|de|com|edu|org|gov|net|fr|uk|be|es|pl|it|eu|nl|ba|cz|dk|al|ad|bg|by|fi|gr|ie|li|lu|no|pt|ro|rs|ru|se|si|sk"
|
| 26 |
|
| 27 |
+
DOT_BEFORE_WORD = [
|
| 28 |
+
rf"(?<!www)\.(?=[{EXTENDED_LETTER_RANGE}]{{{i}}})(?!(({TOP_LEVEL_DOMAINS})\b))"
|
| 29 |
+
for i in range(3, 30)
|
| 30 |
+
]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
DOT_AFTER_WORD = [
|
| 34 |
+
rf"(?<!www\.)(?<=([{EXTENDED_LETTER_RANGE}]){{{i}}})\.(?!({TOP_LEVEL_DOMAINS}))"
|
| 35 |
for i in range(3, 30)
|
| 36 |
]
|
| 37 |
|
| 38 |
+
|
| 39 |
DOT_AFTER_DATE = rf"(?<=({DATE}))\."
|
| 40 |
|
| 41 |
infix_res = [
|
| 42 |
+
r"[\(\[\]\)\,\»\>\«\<]", # brackets, commas, guillemets (» «) and angle brackets (> <) that are without space
|
| 43 |
r"(?<=\.--)\.", # DOT after .--
|
| 44 |
+
r"(?<=\,)\'",
|
| 45 |
+
*DOT_BEFORE_WORD, # DOT before word
|
| 46 |
+
# rf"(?<!www)\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})",
|
| 47 |
r"'\.\.", # e.g., 'Tscheicha'.. -> "Tscheicha" "'..", then split ".." as suffix
|
| 48 |
*DOT_AFTER_WORD, # when there is no space after the dot
|
| 49 |
r"[A-Z](?=\. )", # DOT after capital letter
|
|
|
|
| 54 |
rf"(?<=([{EXTENDED_LETTER_RANGE}]\.){{{i}}})\." for i in range(1, 30)
|
| 55 |
]
|
| 56 |
|
| 57 |
+
prefix_res = [
|
| 58 |
+
rf"\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})", # DOT before word
|
| 59 |
+
*DOT_BEFORE_WORD, # DOT before word
|
| 60 |
+
]
|
| 61 |
+
|
| 62 |
suffix_res = [
|
| 63 |
r"(?<=\d)[\.]", # DOT after number
|
| 64 |
r"(?<=[\.])[\]\)]", # Closing brackets with DOT before
|
|
|
|
| 77 |
|
| 78 |
|
| 79 |
# Given an nlp object, return a custom tokenizer that splits on special cases and has unwanted tokenization removed
|
| 80 |
+
def custom_tokenizer(nlp) -> Tokenizer:
|
| 81 |
nlp.tokenizer = Tokenizer(nlp.vocab)
|
| 82 |
|
| 83 |
+
prefixes = nlp.Defaults.prefixes
|
| 84 |
+
prefixes += prefix_res
|
| 85 |
+
prefix_regex = compile_prefix_regex(prefixes)
|
| 86 |
nlp.tokenizer.prefix_search = prefix_regex.search
|
| 87 |
|
| 88 |
# We use the default infixes and remove some cases that lead to unwanted tokenization.
|
vocab/strings.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|