Update spaCy pipeline
- .gitattributes +1 -0
- README.md +9 -9
- config.cfg +2 -2
- de_trf_ner_base_reflex_nrp-1.0.0-py3-none-any.whl +3 -0
- meta.json +16 -16
- ner/model +1 -1
- ner/moves +1 -1
- ner_transformer/model +1 -1
- tokenizer +0 -0
- use_custom_tokenizer.py +21 -5
- vocab/strings.json +0 -0
.gitattributes
CHANGED
@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 de_trf_ner_base_reflex_nrp-0.0.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
 ner/model filter=lfs diff=lfs merge=lfs -text
 ner_transformer/model filter=lfs diff=lfs merge=lfs -text
+de_trf_ner_base_reflex_nrp-1.0.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -13,18 +13,18 @@ model-index:
   metrics:
   - name: NER Precision
     type: precision
-    value: 0.
+    value: 0.9773609314
   - name: NER Recall
     type: recall
-    value: 0.
+    value: 0.9798962387
   - name: NER F Score
     type: f_score
-    value: 0.
+    value: 0.978626943
 ---
 | Feature | Description |
 | --- | --- |
 | **Name** | `de_trf_ner_base_reflex_nrp` |
-| **Version** | `
+| **Version** | `1.0.0` |
 | **spaCy** | `>=3.8.3,<3.9.0` |
 | **Default Pipeline** | `ner_transformer`, `ner` |
 | **Components** | `ner_transformer`, `ner` |

@@ -49,8 +49,8 @@ model-index:

 | Type | Score |
 | --- | --- |
-| `ENTS_F` | 97.
-| `ENTS_P` | 97.
-| `ENTS_R` | 97.
-| `NER_TRANSFORMER_LOSS` |
-| `NER_LOSS` |
+| `ENTS_F` | 97.86 |
+| `ENTS_P` | 97.74 |
+| `ENTS_R` | 97.99 |
+| `NER_TRANSFORMER_LOSS` | 3997.89 |
+| `NER_LOSS` | 20826.48 |
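For orientation, here is a minimal usage sketch of the packaged pipeline described in the README above. The install command and the example sentence are assumptions for illustration, not part of this commit:

```python
# Minimal sketch, assuming the 1.0.0 wheel added in this commit has been installed,
# e.g.: pip install de_trf_ner_base_reflex_nrp-1.0.0-py3-none-any.whl
import spacy

nlp = spacy.load("de_trf_ner_base_reflex_nrp")  # default pipeline: ner_transformer, ner
doc = nlp("Anna Beispiel arbeitet bei der Muster AG in Zürich.")  # hypothetical sentence
for ent in doc.ents:
    print(ent.text, ent.label_)  # labels: PER, ORG, LOC (see meta.json below)
```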
config.cfg
CHANGED
@@ -1,6 +1,6 @@
 [paths]
-train = "./
-dev = "./
+train = "./dataset/corpus/de/07_2025/train/combined.spacy"
+dev = "./dataset/corpus/de/07_2025/dev/combined.spacy"
 vectors = null
 init_tok2vec = null

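The [paths] change points the config at the July 2025 corpus files. As a hedged sketch of how a retraining run could pick these paths up via spaCy's Python training API (the output directory is an assumption):

```python
# Sketch only: retrain from config.cfg with the corpus paths from this commit.
from spacy.cli.train import train

train(
    "config.cfg",
    output_path="./training_output",  # hypothetical output directory
    overrides={
        "paths.train": "./dataset/corpus/de/07_2025/train/combined.spacy",
        "paths.dev": "./dataset/corpus/de/07_2025/dev/combined.spacy",
    },
)
```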
de_trf_ner_base_reflex_nrp-1.0.0-py3-none-any.whl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57317d04dacd709a9e5dac248f648c94952f41b178e33a14dcda9bfeeb0a7fff
+size 408982425
meta.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "lang":"de",
   "name":"trf_ner_base_reflex_nrp",
-  "version":"
+  "version":"1.0.0",
   "description":"",
   "author":"",
   "email":"",

@@ -37,31 +37,31 @@

   ],
   "performance":{
-    "ents_f":0.
-    "ents_p":0.
-    "ents_r":0.
+    "ents_f":0.978626943,
+    "ents_p":0.9773609314,
+    "ents_r":0.9798962387,
     "ents_per_type":{
       "ORG":{
-        "p":0.
-        "r":0.
-        "f":0.
+        "p":0.9484848485,
+        "r":0.9456193353,
+        "f":0.9470499244
       },
       "LOC":{
-        "p":0.
-        "r":0.
-        "f":0.
+        "p":0.9919168591,
+        "r":0.9930635838,
+        "f":0.9924898902
       },
       "PER":{
-        "p":0.
-        "r":0.
-        "f":0.
+        "p":0.9685714286,
+        "r":0.9797687861,
+        "f":0.974137931
       }
     },
-    "ner_transformer_loss":
-    "ner_loss":
+    "ner_transformer_loss":39.9788878204,
+    "ner_loss":208.264759525
   },
   "requirements":[
-    "spacy-transformers>=1.3.
+    "spacy-transformers>=1.3.9,<1.4.0",
     "spacy>=3.8.3,<3.9.0"
   ]
 }
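A note on the numbers: the README score table above appears to be these meta.json `performance` values multiplied by 100 and rounded to two decimals, which also explains why the losses look a hundred times larger there. A quick check:

```python
# Reproduces the README "Type | Score" table from the meta.json figures in this commit.
performance = {
    "ENTS_F": 0.978626943,
    "ENTS_P": 0.9773609314,
    "ENTS_R": 0.9798962387,
    "NER_TRANSFORMER_LOSS": 39.9788878204,
    "NER_LOSS": 208.264759525,
}
for name, value in performance.items():
    print(name, round(value * 100, 2))  # 97.86, 97.74, 97.99, 3997.89, 20826.48
```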
ner/model
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:969c0055d7b1e700d7ad7b138e74aba275f3a5adb3db1c5692047e4d60ff58e9
 size 219690
ner/moves
CHANGED
@@ -1 +1 @@
-��movesٴ{"0":{},"1":{"
+��movesٴ{"0":{},"1":{"LOC":5568,"ORG":5550,"PER":4662},"2":{"LOC":5568,"ORG":5550,"PER":4662},"3":{"LOC":5568,"ORG":5550,"PER":4662},"4":{"LOC":5568,"ORG":5550,"PER":4662,"":1},"5":{"":1}}�cfg��neg_key�
ner_transformer/model
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:8498589f7afcfecda66a6eb6c61bf466a18bf087d3ef8208507a60a2ecbec3e1
 size 440761243
tokenizer
CHANGED
The diff for this file is too large to render.
use_custom_tokenizer.py
CHANGED
@@ -24,17 +24,26 @@ EXTENDED_LETTER_RANGE = "A-Za-zäöüÄÖÜàòèéìù"
 DATE = r"[0-3][1-9]\.[0-1][1-9]\.[1-2][0-9]{3}"
 TOP_LEVEL_DOMAINS = "ch|at|de|com|edu|org|gov|net|fr|uk|be|es|pl|it|eu|nl|ba|cz|dk|al|ad|bg|by|fi|gr|ie|li|lu|no|pt|ro|rs|ru|se|si|sk"

+DOT_BEFORE_WORD = [
+    rf"(?<!www)\.(?=[{EXTENDED_LETTER_RANGE}]{{{i}}})(?!(({TOP_LEVEL_DOMAINS})\b))"
+    for i in range(3, 30)
+]
+
+
 DOT_AFTER_WORD = [
-    rf"(?<!www\.)(?<=([
+    rf"(?<!www\.)(?<=([{EXTENDED_LETTER_RANGE}]){{{i}}})\.(?!({TOP_LEVEL_DOMAINS}))"
     for i in range(3, 30)
 ]

+
 DOT_AFTER_DATE = rf"(?<=({DATE}))\."

 infix_res = [
-    r"[\(\[\]\)]",
+    r"[\(\[\]\)\,\»\>\«\<]", # brackets, commas and ' that are without space
     r"(?<=\.--)\.", # DOT after .--
-
+    r"(?<=\,)\'",
+    *DOT_BEFORE_WORD, # DOT before word
+    # rf"(?<!www)\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})",
     r"'\.\.", # e.g., 'Tscheicha'.. -> "Tscheicha" "'..", then split ".." as suffix
     *DOT_AFTER_WORD, # when there is no space after the dot
     r"[A-Z](?=\. )", # DOT after capital letter

@@ -45,6 +54,11 @@ LETTER_DOUBLE_ENDING_DOT_VAR_LENGTH = [ # DOT after letter, e.g., A.G., or u.s.
     rf"(?<=([{EXTENDED_LETTER_RANGE}]\.){{{i}}})\." for i in range(1, 30)
 ]

+prefix_res = [
+    rf"\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})", # DOT before word
+    *DOT_BEFORE_WORD, # DOT before word
+]
+
 suffix_res = [
     r"(?<=\d)[\.]", # DOT after number
     r"(?<=[\.])[\]\)]", # Closing brackets with DOT before

@@ -63,10 +77,12 @@ NUMBER_SIGN_NUMBER_FIXED = r"(?<=[0-9])[+\*^](?=[0-9])"


 # Given a nlp object, return a custom tokenizer that splits on special cases and with unwanted tokenization removed
-def custom_tokenizer(nlp):
+def custom_tokenizer(nlp) -> Tokenizer:
     nlp.tokenizer = Tokenizer(nlp.vocab)

-
+    prefixes = nlp.Defaults.prefixes
+    prefixes += prefix_res
+    prefix_regex = compile_prefix_regex(prefixes)
     nlp.tokenizer.prefix_search = prefix_regex.search

     # We use the default infixes and remove some cases that lead to unwanted tokenization.
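The new DOT_BEFORE_WORD and prefix_res rules are wired into the tokenizer inside custom_tokenizer(), which replaces nlp.tokenizer in place. A minimal sketch of applying it to the packaged pipeline; the import path and the example sentence are assumptions:

```python
# Sketch: attach the custom tokenization rules shipped in use_custom_tokenizer.py
# to the loaded pipeline (assumes the module is on the Python path).
import spacy
from use_custom_tokenizer import custom_tokenizer

nlp = spacy.load("de_trf_ner_base_reflex_nrp")
custom_tokenizer(nlp)  # swaps in a Tokenizer built from the prefix/infix/suffix rules above

# Dots glued to words are split, while "www." prefixes and top-level domains
# such as ".ch" or ".de" appear to be left untouched by the new regexes.
doc = nlp("Siehe www.beispiel.ch und die Muster AG.Bern, gegründet am 01.02.1999.")
print([token.text for token in doc])
```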
vocab/strings.json
CHANGED
The diff for this file is too large to render.