Lazy-Val committed on
Commit
c426c45
·
verified ·
1 Parent(s): 11cdbc3

Update spaCy pipeline

Browse files
.gitattributes CHANGED
@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  de_trf_ner_base_reflex_nrp-0.0.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
37
  ner/model filter=lfs diff=lfs merge=lfs -text
38
  ner_transformer/model filter=lfs diff=lfs merge=lfs -text
 
 
36
  de_trf_ner_base_reflex_nrp-0.0.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
37
  ner/model filter=lfs diff=lfs merge=lfs -text
38
  ner_transformer/model filter=lfs diff=lfs merge=lfs -text
39
+ de_trf_ner_base_reflex_nrp-1.0.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -13,18 +13,18 @@ model-index:
13
  metrics:
14
  - name: NER Precision
15
  type: precision
16
- value: 0.9710610932
17
  - name: NER Recall
18
  type: recall
19
- value: 0.971842317
20
  - name: NER F Score
21
  type: f_score
22
- value: 0.971451548
23
  ---
24
  | Feature | Description |
25
  | --- | --- |
26
  | **Name** | `de_trf_ner_base_reflex_nrp` |
27
- | **Version** | `0.0.0` |
28
  | **spaCy** | `>=3.8.3,<3.9.0` |
29
  | **Default Pipeline** | `ner_transformer`, `ner` |
30
  | **Components** | `ner_transformer`, `ner` |
@@ -49,8 +49,8 @@ model-index:
49
 
50
  | Type | Score |
51
  | --- | --- |
52
- | `ENTS_F` | 97.15 |
53
- | `ENTS_P` | 97.11 |
54
- | `ENTS_R` | 97.18 |
55
- | `NER_TRANSFORMER_LOSS` | 4331.69 |
56
- | `NER_LOSS` | 33002.60 |
 
13
  metrics:
14
  - name: NER Precision
15
  type: precision
16
+ value: 0.9773609314
17
  - name: NER Recall
18
  type: recall
19
+ value: 0.9798962387
20
  - name: NER F Score
21
  type: f_score
22
+ value: 0.978626943
23
  ---
24
  | Feature | Description |
25
  | --- | --- |
26
  | **Name** | `de_trf_ner_base_reflex_nrp` |
27
+ | **Version** | `1.0.0` |
28
  | **spaCy** | `>=3.8.3,<3.9.0` |
29
  | **Default Pipeline** | `ner_transformer`, `ner` |
30
  | **Components** | `ner_transformer`, `ner` |
 
49
 
50
  | Type | Score |
51
  | --- | --- |
52
+ | `ENTS_F` | 97.86 |
53
+ | `ENTS_P` | 97.74 |
54
+ | `ENTS_R` | 97.99 |
55
+ | `NER_TRANSFORMER_LOSS` | 3997.89 |
56
+ | `NER_LOSS` | 20826.48 |
config.cfg CHANGED
@@ -1,6 +1,6 @@
1
  [paths]
2
- train = "./data/raw/training/02-2025/de/train/combined.spacy"
3
- dev = "./data/raw/training/02-2025/de/dev/combined.spacy"
4
  vectors = null
5
  init_tok2vec = null
6
 
 
1
  [paths]
2
+ train = "./dataset/corpus/de/07_2025/train/combined.spacy"
3
+ dev = "./dataset/corpus/de/07_2025/dev/combined.spacy"
4
  vectors = null
5
  init_tok2vec = null
6
 
de_trf_ner_base_reflex_nrp-1.0.0-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57317d04dacd709a9e5dac248f648c94952f41b178e33a14dcda9bfeeb0a7fff
3
+ size 408982425
meta.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "lang":"de",
3
  "name":"trf_ner_base_reflex_nrp",
4
- "version":"0.0.0",
5
  "description":"",
6
  "author":"",
7
  "email":"",
@@ -37,31 +37,31 @@
37
 
38
  ],
39
  "performance":{
40
- "ents_f":0.971451548,
41
- "ents_p":0.9710610932,
42
- "ents_r":0.971842317,
43
  "ents_per_type":{
44
  "ORG":{
45
- "p":0.9395973154,
46
- "r":0.9523809524,
47
- "f":0.9459459459
48
  },
49
  "LOC":{
50
- "p":0.981186686,
51
- "r":0.9755395683,
52
- "f":0.9783549784
53
  },
54
  "PER":{
55
- "p":0.9803921569,
56
- "r":0.9842519685,
57
- "f":0.9823182711
58
  }
59
  },
60
- "ner_transformer_loss":43.3168994993,
61
- "ner_loss":330.0260414947
62
  },
63
  "requirements":[
64
- "spacy-transformers>=1.3.8,<1.4.0",
65
  "spacy>=3.8.3,<3.9.0"
66
  ]
67
  }
 
1
  {
2
  "lang":"de",
3
  "name":"trf_ner_base_reflex_nrp",
4
+ "version":"1.0.0",
5
  "description":"",
6
  "author":"",
7
  "email":"",
 
37
 
38
  ],
39
  "performance":{
40
+ "ents_f":0.978626943,
41
+ "ents_p":0.9773609314,
42
+ "ents_r":0.9798962387,
43
  "ents_per_type":{
44
  "ORG":{
45
+ "p":0.9484848485,
46
+ "r":0.9456193353,
47
+ "f":0.9470499244
48
  },
49
  "LOC":{
50
+ "p":0.9919168591,
51
+ "r":0.9930635838,
52
+ "f":0.9924898902
53
  },
54
  "PER":{
55
+ "p":0.9685714286,
56
+ "r":0.9797687861,
57
+ "f":0.974137931
58
  }
59
  },
60
+ "ner_transformer_loss":39.9788878204,
61
+ "ner_loss":208.264759525
62
  },
63
  "requirements":[
64
+ "spacy-transformers>=1.3.9,<1.4.0",
65
  "spacy>=3.8.3,<3.9.0"
66
  ]
67
  }
ner/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0693e3662125cda25380a809a5937f1c78ce427c643bf0eb5973c78a99b8389
3
  size 219690
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:969c0055d7b1e700d7ad7b138e74aba275f3a5adb3db1c5692047e4d60ff58e9
3
  size 219690
ner/moves CHANGED
@@ -1 +1 @@
1
- ��movesٴ{"0":{},"1":{"ORG":5064,"LOC":4932,"PER":4078},"2":{"ORG":5064,"LOC":4932,"PER":4078},"3":{"ORG":5064,"LOC":4932,"PER":4078},"4":{"ORG":5064,"LOC":4932,"PER":4078,"":1},"5":{"":1}}�cfg��neg_key�
 
1
+ ��movesٴ{"0":{},"1":{"LOC":5568,"ORG":5550,"PER":4662},"2":{"LOC":5568,"ORG":5550,"PER":4662},"3":{"LOC":5568,"ORG":5550,"PER":4662},"4":{"LOC":5568,"ORG":5550,"PER":4662,"":1},"5":{"":1}}�cfg��neg_key�
ner_transformer/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:022d8220abeb5033522347c50e4b27ed090f01329dbec63b0a656bb81e63df1f
3
  size 440761243
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8498589f7afcfecda66a6eb6c61bf466a18bf087d3ef8208507a60a2ecbec3e1
3
  size 440761243
tokenizer CHANGED
The diff for this file is too large to render. See raw diff
 
use_custom_tokenizer.py CHANGED
@@ -24,17 +24,26 @@ EXTENDED_LETTER_RANGE = "A-Za-zäöüÄÖÜàòèéìù"
24
  DATE = r"[0-3][1-9]\.[0-1][1-9]\.[1-2][0-9]{3}"
25
  TOP_LEVEL_DOMAINS = "ch|at|de|com|edu|org|gov|net|fr|uk|be|es|pl|it|eu|nl|ba|cz|dk|al|ad|bg|by|fi|gr|ie|li|lu|no|pt|ro|rs|ru|se|si|sk"
26
 
 
 
 
 
 
 
27
  DOT_AFTER_WORD = [
28
- rf"(?<!www\.)(?<=([a-zA-ZäöüÄÖÜ]){{{i}}})\.(?!({TOP_LEVEL_DOMAINS}))"
29
  for i in range(3, 30)
30
  ]
31
 
 
32
  DOT_AFTER_DATE = rf"(?<=({DATE}))\."
33
 
34
  infix_res = [
35
- r"[\(\[\]\)]",
36
  r"(?<=\.--)\.", # DOT after .--
37
- rf"\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})", # DOT before word
 
 
38
  r"'\.\.", # e.g., 'Tscheicha'.. -> "Tscheicha" "'..", then split ".." as suffix
39
  *DOT_AFTER_WORD, # when there is no space after the dot
40
  r"[A-Z](?=\. )", # DOT after capital letter
@@ -45,6 +54,11 @@ LETTER_DOUBLE_ENDING_DOT_VAR_LENGTH = [ # DOT after letter, e.g., A.G., or u.s.
45
  rf"(?<=([{EXTENDED_LETTER_RANGE}]\.){{{i}}})\." for i in range(1, 30)
46
  ]
47
 
 
 
 
 
 
48
  suffix_res = [
49
  r"(?<=\d)[\.]", # DOT after number
50
  r"(?<=[\.])[\]\)]", # Closing brackets with DOT before
@@ -63,10 +77,12 @@ NUMBER_SIGN_NUMBER_FIXED = r"(?<=[0-9])[+\*^](?=[0-9])"
63
 
64
 
65
  # Given a nlp object, return a custom tokenizer that splits on special cases and with unwanted tokenization removed
66
- def custom_tokenizer(nlp):
67
  nlp.tokenizer = Tokenizer(nlp.vocab)
68
 
69
- prefix_regex = compile_prefix_regex(nlp.Defaults.prefixes)
 
 
70
  nlp.tokenizer.prefix_search = prefix_regex.search
71
 
72
  # We use the default infixes and remove some cases that lead to unwanted tokenization.
 
24
  DATE = r"[0-3][1-9]\.[0-1][1-9]\.[1-2][0-9]{3}"
25
  TOP_LEVEL_DOMAINS = "ch|at|de|com|edu|org|gov|net|fr|uk|be|es|pl|it|eu|nl|ba|cz|dk|al|ad|bg|by|fi|gr|ie|li|lu|no|pt|ro|rs|ru|se|si|sk"
26
 
27
+ DOT_BEFORE_WORD = [
28
+ rf"(?<!www)\.(?=[{EXTENDED_LETTER_RANGE}]{{{i}}})(?!(({TOP_LEVEL_DOMAINS})\b))"
29
+ for i in range(3, 30)
30
+ ]
31
+
32
+
33
  DOT_AFTER_WORD = [
34
+ rf"(?<!www\.)(?<=([{EXTENDED_LETTER_RANGE}]){{{i}}})\.(?!({TOP_LEVEL_DOMAINS}))"
35
  for i in range(3, 30)
36
  ]
37
 
38
+
39
  DOT_AFTER_DATE = rf"(?<=({DATE}))\."
40
 
41
  infix_res = [
42
+ r"[\(\[\]\)\,\»\>\«\<]", # brackets, commas and ' that are without space
43
  r"(?<=\.--)\.", # DOT after .--
44
+ r"(?<=\,)\'",
45
+ *DOT_BEFORE_WORD, # DOT before word
46
+ # rf"(?<!www)\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})",
47
  r"'\.\.", # e.g., 'Tscheicha'.. -> "Tscheicha" "'..", then split ".." as suffix
48
  *DOT_AFTER_WORD, # when there is no space after the dot
49
  r"[A-Z](?=\. )", # DOT after capital letter
 
54
  rf"(?<=([{EXTENDED_LETTER_RANGE}]\.){{{i}}})\." for i in range(1, 30)
55
  ]
56
 
57
+ prefix_res = [
58
+ rf"\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})", # DOT before word
59
+ *DOT_BEFORE_WORD, # DOT before word
60
+ ]
61
+
62
  suffix_res = [
63
  r"(?<=\d)[\.]", # DOT after number
64
  r"(?<=[\.])[\]\)]", # Closing brackets with DOT before
 
77
 
78
 
79
  # Given a nlp object, return a custom tokenizer that splits on special cases and with unwanted tokenization removed
80
+ def custom_tokenizer(nlp) -> Tokenizer:
81
  nlp.tokenizer = Tokenizer(nlp.vocab)
82
 
83
+ prefixes = nlp.Defaults.prefixes
84
+ prefixes += prefix_res
85
+ prefix_regex = compile_prefix_regex(prefixes)
86
  nlp.tokenizer.prefix_search = prefix_regex.search
87
 
88
  # We use the default infixes and remove some cases that lead to unwanted tokenization.
vocab/strings.json CHANGED
The diff for this file is too large to render. See raw diff