Update spaCy pipeline

Files changed:
- README.md +1 -1
- base_transformer/model +2 -2
- config.cfg +11 -8
- de_trf_reflex_nrp-0.0.0-py3-none-any.whl +2 -2
- meta.json +4 -4
- ner/model +1 -1
- ner/moves +1 -1
- ner_transformer/model +2 -2
- tokenizer +0 -0
- use_custom_tokenizer.py +21 -5
- vocab/strings.json +0 -0

README.md
CHANGED

@@ -9,7 +9,7 @@ language:
 | --- | --- |
 | **Name** | `de_trf_reflex_nrp` |
 | **Version** | `0.0.0` |
-| **spaCy** | `>=3.8.
+| **spaCy** | `>=3.8.7,<3.9.0` |
 | **Default Pipeline** | `ner_transformer`, `ner`, `base_transformer`, `morphologizer`, `tagger`, `parser`, `trainable_lemmatizer` |
 | **Components** | `ner_transformer`, `ner`, `base_transformer`, `morphologizer`, `tagger`, `parser`, `trainable_lemmatizer` |
 | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |

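Per the README table, the package bundles seven pipeline components. A usage sketch, assuming the wheel from this repository has been installed (e.g. `pip install de_trf_reflex_nrp-0.0.0-py3-none-any.whl`):

```python
# Sketch: load the installed package and inspect its component order.
import spacy

nlp = spacy.load("de_trf_reflex_nrp")
print(nlp.pipe_names)
# expected: ['ner_transformer', 'ner', 'base_transformer', 'morphologizer',
#            'tagger', 'parser', 'trainable_lemmatizer']
```
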
base_transformer/model
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:fe87b1b20d663c6b3a4639ac363c010d54c8dd148277205284de0dfefbac246c
+size 440759602

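This file (like the other `model` and `.whl` entries in this commit) is a Git LFS pointer: the repository tracks only a version line, an `oid`, and a `size`, where the `oid` is the SHA-256 of the real artifact. A minimal verification sketch (the local path is illustrative):

```python
# Sketch: verify a downloaded artifact against the oid in its LFS pointer.
import hashlib

def sha256_of(path: str) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

expected = "fe87b1b20d663c6b3a4639ac363c010d54c8dd148277205284de0dfefbac246c"
print(sha256_of("base_transformer/model") == expected)  # True if intact
```
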
config.cfg
CHANGED

@@ -16,7 +16,7 @@ before_creation = null
 after_creation = null
 after_pipeline_creation = null
 batch_size = 1000
-tokenizer = {"@tokenizers":"
+tokenizer = {"@tokenizers":"customize_tokenizer"}
 vectors = {"@vectors":"spacy.Vectors.v1"}
 
 [components]
@@ -228,21 +228,24 @@ eps = 0.00000001
 learn_rate = 0.001
 
 [training.score_weights]
-ents_f = 0.
+ents_f = 0.22
 ents_p = 0.0
 ents_r = 0.0
 ents_per_type = null
-pos_acc = 0.
-morph_acc = 0.
+pos_acc = 0.0
+morph_acc = 0.11
 morph_per_feat = null
-tag_acc = 0.
-dep_uas = 0.
-dep_las = 0.
+tag_acc = 0.22
+tag_micro_p = null
+tag_micro_r = null
+tag_micro_f = null
+dep_uas = 0.11
+dep_las = 0.11
 dep_las_per_type = null
 sents_p = null
 sents_r = null
 sents_f = 0.0
-lemma_acc = 0.
+lemma_acc = 0.22
 
 [pretraining]

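Two things change here. First, the `tokenizer` entry now resolves through spaCy's function registry under the name `customize_tokenizer` instead of the stock tokenizer. A registered factory of that shape would look roughly like this minimal sketch; the real rules live in `use_custom_tokenizer.py`, and the bare `Tokenizer` here is only a stand-in:

```python
# Minimal sketch of a "customize_tokenizer" registry entry; the actual
# factory presumably builds its tokenizer via use_custom_tokenizer.py.
import spacy
from spacy.tokenizer import Tokenizer

@spacy.registry.tokenizers("customize_tokenizer")
def create_customize_tokenizer():
    def create_tokenizer(nlp):
        return Tokenizer(nlp.vocab)  # stand-in for the customized rules
    return create_tokenizer
```

Second, the `[training.score_weights]` now sum to 0.99 (three weights of 0.22 plus three of 0.11), so model selection weights NER F-score, tag accuracy, and lemma accuracy twice as heavily as morph accuracy, UAS, and LAS.
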
de_trf_reflex_nrp-0.0.0-py3-none-any.whl
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:8847b573ed146775d3d3258db3ae3c39f23920d3c5e1117d0db18d774380eb0f
+size 865731526

meta.json
CHANGED

@@ -7,8 +7,8 @@
 "email":"",
 "url":"",
 "license":"",
-"spacy_version":">=3.8.
-"spacy_git_version":"
+"spacy_version":">=3.8.7,<3.9.0",
+"spacy_git_version":"4b65aa7",
 "vectors":{
 "width":0,
 "vectors":0,
@@ -1849,7 +1849,7 @@
 
 ],
 "requirements":[
-"spacy-transformers>=1.3.
-"spacy>=3.8.
+"spacy-transformers>=1.3.9,<1.4.0",
+"spacy>=3.8.7,<3.9.0"
 ]
 }

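The requirements now pin exact compatibility windows. A quick environment check, sketched with the `packaging` library (the same version machinery pip uses):

```python
# Sketch: check the installed spaCy against the meta.json pin.
import spacy
from packaging.specifiers import SpecifierSet
from packaging.version import Version

pin = SpecifierSet(">=3.8.7,<3.9.0")
print(pin.contains(Version(spacy.__version__)))  # True on a compatible install
```
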
ner/model
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:969c0055d7b1e700d7ad7b138e74aba275f3a5adb3db1c5692047e4d60ff58e9
 size 219690

ner/moves
CHANGED

@@ -1 +1 @@
-��movesٴ{"0":{},"1":{"
+��movesٴ{"0":{},"1":{"LOC":5568,"ORG":5550,"PER":4662},"2":{"LOC":5568,"ORG":5550,"PER":4662},"3":{"LOC":5568,"ORG":5550,"PER":4662},"4":{"LOC":5568,"ORG":5550,"PER":4662,"":1},"5":{"":1}}�cfg��neg_key�

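The moves file is the msgpack-serialized transition table of spaCy's transition-based NER; the numbered keys appear to be transition actions, and the nested dicts hold per-label counts for LOC, ORG, and PER. Once the package is installed, the label set can be confirmed directly (a sketch):

```python
# Sketch: confirm the NER label set implied by the moves table.
import spacy

nlp = spacy.load("de_trf_reflex_nrp")
print(nlp.get_pipe("ner").labels)  # expected: ('LOC', 'ORG', 'PER')
```
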
ner_transformer/model
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:7c94683b8cce69d51530a2a91de1a4dee720b1dc60e2abbdb702f9d0d08db7fd
+size 440759609

tokenizer
CHANGED

The diff for this file is too large to render. See raw diff.

use_custom_tokenizer.py
CHANGED

@@ -24,17 +24,26 @@ EXTENDED_LETTER_RANGE = "A-Za-zäöüÄÖÜàòèéìù"
 DATE = r"[0-3][1-9]\.[0-1][1-9]\.[1-2][0-9]{3}"
 TOP_LEVEL_DOMAINS = "ch|at|de|com|edu|org|gov|net|fr|uk|be|es|pl|it|eu|nl|ba|cz|dk|al|ad|bg|by|fi|gr|ie|li|lu|no|pt|ro|rs|ru|se|si|sk"
 
+DOT_BEFORE_WORD = [
+    rf"(?<!www)\.(?=[{EXTENDED_LETTER_RANGE}]{{{i}}})(?!(({TOP_LEVEL_DOMAINS})\b))"
+    for i in range(3, 30)
+]
+
+
 DOT_AFTER_WORD = [
-    rf"(?<!www\.)(?<=([
+    rf"(?<!www\.)(?<=([{EXTENDED_LETTER_RANGE}]){{{i}}})\.(?!({TOP_LEVEL_DOMAINS}))"
     for i in range(3, 30)
 ]
 
+
 DOT_AFTER_DATE = rf"(?<=({DATE}))\."
 
 infix_res = [
-    r"[\(\[\]\)]",
+    r"[\(\[\]\)\,\»\>\«\<]",  # brackets, commas and ' that are without space
     r"(?<=\.--)\.",  # DOT after .--
-
+    r"(?<=\,)\'",
+    *DOT_BEFORE_WORD,  # DOT before word
+    # rf"(?<!www)\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})",
     r"'\.\.",  # e.g., 'Tscheicha'.. -> "Tscheicha" "'..", then split ".." as suffix
     *DOT_AFTER_WORD,  # when there is no space after the dot
     r"[A-Z](?=\. )",  # DOT after capital letter
@@ -45,6 +54,11 @@ LETTER_DOUBLE_ENDING_DOT_VAR_LENGTH = [ # DOT after letter, e.g., A.G., or u.s.
     rf"(?<=([{EXTENDED_LETTER_RANGE}]\.){{{i}}})\." for i in range(1, 30)
 ]
 
+prefix_res = [
+    rf"\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})",  # DOT before word
+    *DOT_BEFORE_WORD,  # DOT before word
+]
+
 suffix_res = [
     r"(?<=\d)[\.]",  # DOT after number
     r"(?<=[\.])[\]\)]",  # Closing brackets with DOT before
@@ -63,10 +77,12 @@ NUMBER_SIGN_NUMBER_FIXED = r"(?<=[0-9])[+\*^](?=[0-9])"
 
 
 # Given a nlp object, return a custom tokenizer that splits on special cases and with unwanted tokenization removed
-def custom_tokenizer(nlp):
+def custom_tokenizer(nlp) -> Tokenizer:
     nlp.tokenizer = Tokenizer(nlp.vocab)
 
-    prefix_regex = compile_prefix_regex(nlp.Defaults.prefixes)
+    prefixes = nlp.Defaults.prefixes
+    prefixes += prefix_res
+    prefix_regex = compile_prefix_regex(prefixes)
     nlp.tokenizer.prefix_search = prefix_regex.search
 
     # We use the default infixes and remove some cases that lead to unwanted tokenization.

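The heart of this change is the new DOT_BEFORE_WORD family, used both as infixes and as prefixes: it splits a dot glued to a following word while leaving domains that end in a known TLD intact. A standalone check of one generated pattern (i = 3; the TLD list is abbreviated for the example):

```python
# Quick check of one DOT_BEFORE_WORD instance: split "word.Word",
# but do not split before a known top-level domain.
import re

EXTENDED_LETTER_RANGE = "A-Za-zäöüÄÖÜàòèéìù"
TOP_LEVEL_DOMAINS = "ch|at|de|com"  # abbreviated; the module lists ~30 TLDs

pattern = re.compile(
    rf"(?<!www)\.(?=[{EXTENDED_LETTER_RANGE}]{{3}})(?!(({TOP_LEVEL_DOMAINS})\b))"
)
print(bool(pattern.search("Satzende.Nächster")))  # True  -> dot is split as infix
print(bool(pattern.search("example.com")))        # False -> TLD keeps URL intact
```
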
vocab/strings.json
CHANGED

The diff for this file is too large to render. See raw diff.