Update spaCy pipeline
Browse files
- README.md +12 -10
- config.cfg +15 -13
- meta.json +12 -10
- relationFactory.py +27 -12
- relation_extractor/model +0 -0
- ru_patents_rel_tiny-any-py3-none-any.whl +2 -2
- transformer/cfg +1 -1
- transformer/model +2 -2
README.md
CHANGED
|
@@ -35,15 +35,17 @@ model-index:
|
|
| 35 |
|
| 36 |
| Type | Score |
|
| 37 |
| --- | --- |
|
| 38 |
-
| `REL_MICRO_P` |
|
| 39 |
-
| `REL_MICRO_R` |
|
| 40 |
-
| `REL_MICRO_F` |
|
| 41 |
-
| `REL_MACRO_F` |
|
| 42 |
-
| `REL_WEIGHTED_F` |
|
| 43 |
| `F1_PART-OF` | 37.96 |
|
| 44 |
-
| `F1_LOCATED-AT` |
|
| 45 |
-
| `F1_CONNECTED-WITH` |
|
| 46 |
| `F1_IN-MANNER-OF` | 0.00 |
|
| 47 |
-
| `F1_ATTRIBUTE-FOR` |
|
| 48 |
-
| `
|
| 49 |
-
| `
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
| Type | Score |
|
| 37 |
| --- | --- |
|
| 38 |
+
| `REL_MICRO_P` | 46.91 |
|
| 39 |
+
| `REL_MICRO_R` | 15.40 |
|
| 40 |
+
| `REL_MICRO_F` | 23.18 |
|
| 41 |
+
| `REL_MACRO_F` | 12.91 |
|
| 42 |
+
| `REL_WEIGHTED_F` | 21.04 |
|
| 43 |
| `F1_PART-OF` | 37.96 |
|
| 44 |
+
| `F1_LOCATED-AT` | 12.87 |
|
| 45 |
+
| `F1_CONNECTED-WITH` | 5.75 |
|
| 46 |
| `F1_IN-MANNER-OF` | 0.00 |
|
| 47 |
+
| `F1_ATTRIBUTE-FOR` | 7.94 |
|
| 48 |
+
| `F1_MACRO` | 0.00 |
|
| 49 |
+
| `F1_WEIGHTED` | 0.00 |
|
| 50 |
+
| `TRANSFORMER_LOSS` | 2.90 |
|
| 51 |
+
| `RELATION_EXTRACTOR_LOSS` | 132.27 |
|
config.cfg
CHANGED
|
@@ -17,7 +17,7 @@ before_creation = null
|
|
| 17 |
after_creation = null
|
| 18 |
after_pipeline_creation = null
|
| 19 |
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
| 20 |
-
batch_size =
|
| 21 |
vectors = {"@vectors":"spacy.Vectors.v1"}
|
| 22 |
|
| 23 |
[components]
|
|
@@ -41,7 +41,7 @@ pooling = {"@layers":"reduce_mean.v1"}
|
|
| 41 |
|
| 42 |
[components.relation_extractor.model.create_instance_tensor.get_instances]
|
| 43 |
@misc = "rel_instance_generator.v1"
|
| 44 |
-
max_length =
|
| 45 |
|
| 46 |
[components.relation_extractor.model.create_instance_tensor.tok2vec]
|
| 47 |
@architectures = "spacy-transformers.TransformerListener.v1"
|
|
@@ -51,7 +51,7 @@ upstream = "*"
|
|
| 51 |
|
| 52 |
[components.transformer]
|
| 53 |
factory = "transformer"
|
| 54 |
-
max_batch_items =
|
| 55 |
set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
|
| 56 |
|
| 57 |
[components.transformer.model]
|
|
@@ -88,8 +88,8 @@ dropout = 0.2
|
|
| 88 |
accumulate_gradient = 1
|
| 89 |
patience = 1600000
|
| 90 |
max_epochs = 0
|
| 91 |
-
max_steps =
|
| 92 |
-
eval_frequency =
|
| 93 |
frozen_components = []
|
| 94 |
dev_corpus = "corpora.dev"
|
| 95 |
train_corpus = "corpora.train"
|
|
@@ -127,14 +127,16 @@ initial_rate = 0.00005
|
|
| 127 |
[training.score_weights]
|
| 128 |
rel_micro_p = 0.0
|
| 129 |
rel_micro_r = 0.0
|
| 130 |
-
rel_micro_f = 0.
|
| 131 |
-
rel_macro_f = 0.
|
| 132 |
-
rel_weighted_f = 0.
|
| 133 |
-
f1_PART-OF = 0.
|
| 134 |
-
f1_LOCATED-AT = 0.
|
| 135 |
-
f1_CONNECTED-WITH = 0.
|
| 136 |
-
f1_IN-MANNER-OF = 0.
|
| 137 |
-
f1_ATTRIBUTE-FOR = 0.
|
|
|
|
|
|
|
| 138 |
|
| 139 |
[pretraining]
|
| 140 |
|
|
|
|
| 17 |
after_creation = null
|
| 18 |
after_pipeline_creation = null
|
| 19 |
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
| 20 |
+
batch_size = 200
|
| 21 |
vectors = {"@vectors":"spacy.Vectors.v1"}
|
| 22 |
|
| 23 |
[components]
|
|
|
|
| 41 |
|
| 42 |
[components.relation_extractor.model.create_instance_tensor.get_instances]
|
| 43 |
@misc = "rel_instance_generator.v1"
|
| 44 |
+
max_length = 100
|
| 45 |
|
| 46 |
[components.relation_extractor.model.create_instance_tensor.tok2vec]
|
| 47 |
@architectures = "spacy-transformers.TransformerListener.v1"
|
|
|
|
| 51 |
|
| 52 |
[components.transformer]
|
| 53 |
factory = "transformer"
|
| 54 |
+
max_batch_items = 2096
|
| 55 |
set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
|
| 56 |
|
| 57 |
[components.transformer.model]
|
|
|
|
| 88 |
accumulate_gradient = 1
|
| 89 |
patience = 1600000
|
| 90 |
max_epochs = 0
|
| 91 |
+
max_steps = 5000
|
| 92 |
+
eval_frequency = 50
|
| 93 |
frozen_components = []
|
| 94 |
dev_corpus = "corpora.dev"
|
| 95 |
train_corpus = "corpora.train"
|
|
|
|
| 127 |
[training.score_weights]
|
| 128 |
rel_micro_p = 0.0
|
| 129 |
rel_micro_r = 0.0
|
| 130 |
+
rel_micro_f = 0.1
|
| 131 |
+
rel_macro_f = 0.1
|
| 132 |
+
rel_weighted_f = 0.1
|
| 133 |
+
f1_PART-OF = 0.1
|
| 134 |
+
f1_LOCATED-AT = 0.1
|
| 135 |
+
f1_CONNECTED-WITH = 0.1
|
| 136 |
+
f1_IN-MANNER-OF = 0.1
|
| 137 |
+
f1_ATTRIBUTE-FOR = 0.1
|
| 138 |
+
f1_macro = 0.1
|
| 139 |
+
f1_weighted = 0.1
|
| 140 |
|
| 141 |
[pretraining]
|
| 142 |
|
meta.json
CHANGED
|
@@ -39,18 +39,20 @@
|
|
| 39 |
|
| 40 |
],
|
| 41 |
"performance":{
|
| 42 |
-
"rel_micro_p":0.
|
| 43 |
-
"rel_micro_r":0.
|
| 44 |
-
"rel_micro_f":0.
|
| 45 |
-
"rel_macro_f":0.
|
| 46 |
-
"rel_weighted_f":0.
|
| 47 |
"f1_PART-OF":0.3796196627,
|
| 48 |
-
"f1_LOCATED-AT":0.
|
| 49 |
-
"f1_CONNECTED-WITH":0.
|
| 50 |
"f1_IN-MANNER-OF":0.0,
|
| 51 |
-
"f1_ATTRIBUTE-FOR":0.
|
| 52 |
-
"
|
| 53 |
-
"
|
|
|
|
|
|
|
| 54 |
},
|
| 55 |
"requirements":[
|
| 56 |
"spacy-transformers>=1.3.8,<1.4.0",
|
|
|
|
| 39 |
|
| 40 |
],
|
| 41 |
"performance":{
|
| 42 |
+
"rel_micro_p":0.4690909091,
|
| 43 |
+
"rel_micro_r":0.1539746956,
|
| 44 |
+
"rel_micro_f":0.2318475917,
|
| 45 |
+
"rel_macro_f":0.1290549706,
|
| 46 |
+
"rel_weighted_f":0.2104325211,
|
| 47 |
"f1_PART-OF":0.3796196627,
|
| 48 |
+
"f1_LOCATED-AT":0.1286863271,
|
| 49 |
+
"f1_CONNECTED-WITH":0.0575296108,
|
| 50 |
"f1_IN-MANNER-OF":0.0,
|
| 51 |
+
"f1_ATTRIBUTE-FOR":0.0794392523,
|
| 52 |
+
"f1_macro":0.0,
|
| 53 |
+
"f1_weighted":0.0,
|
| 54 |
+
"transformer_loss":0.0290014347,
|
| 55 |
+
"relation_extractor_loss":1.322729256
|
| 56 |
},
|
| 57 |
"requirements":[
|
| 58 |
"spacy-transformers>=1.3.8,<1.4.0",
|
relationFactory.py
CHANGED
|
@@ -1,29 +1,34 @@
|
|
|
|
|
| 1 |
from typing import Tuple, List, Iterable, Optional, Dict, Callable, Any
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import spacy
|
| 4 |
from spacy.tokens import Doc, Span
|
| 5 |
from thinc.types import Floats2d, Ints1d, Ragged, cast
|
| 6 |
-
from thinc.api import Model, Linear, chain, Logistic
|
| 7 |
|
| 8 |
import json
|
| 9 |
import os
|
| 10 |
import time
|
| 11 |
-
from itertools import islice
|
| 12 |
from pathlib import Path
|
| 13 |
|
| 14 |
from sklearn.metrics import precision_recall_fscore_support, f1_score
|
| 15 |
-
import numpy
|
| 16 |
-
from spacy.training.example import Example
|
| 17 |
-
from spacy.tokens.doc import Doc
|
| 18 |
-
from spacy.pipeline.trainable_pipe import TrainablePipe
|
| 19 |
-
from spacy.vocab import Vocab
|
| 20 |
-
from spacy import Language
|
| 21 |
-
from thinc.model import set_dropout_rate
|
| 22 |
-
from wasabi import Printer
|
| 23 |
import plotly.express as px
|
| 24 |
import plotly.graph_objects as go
|
| 25 |
|
| 26 |
-
|
| 27 |
@spacy.registry.architectures("rel_model.v1")
|
| 28 |
def create_relation_model(
|
| 29 |
create_instance_tensor: Model[List[Doc], Floats2d],
|
|
@@ -265,6 +270,17 @@ class RelationExtractor(TrainablePipe):
|
|
| 265 |
self.set_annotations(docs, predictions)
|
| 266 |
return losses
|
| 267 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
|
| 269 |
"""Find the loss and gradient of loss for the batch of documents and
|
| 270 |
their predicted scores."""
|
|
@@ -452,4 +468,3 @@ def score_relations(examples: Iterable[Example], threshold: float) -> Dict[str,
|
|
| 452 |
result["rel_weighted_f"] = f1_score(y_true, y_pred, average="weighted", labels=labels, zero_division=0)
|
| 453 |
|
| 454 |
return result
|
| 455 |
-
|
|
|
|
| 1 |
+
from itertools import islice
|
| 2 |
from typing import Tuple, List, Iterable, Optional, Dict, Callable, Any
|
| 3 |
|
| 4 |
+
from spacy.scorer import PRFScore
|
| 5 |
+
from thinc.types import Floats2d
|
| 6 |
+
import numpy
|
| 7 |
+
from spacy.training.example import Example
|
| 8 |
+
from thinc.api import Model, Optimizer
|
| 9 |
+
from spacy.tokens.doc import Doc
|
| 10 |
+
from spacy.pipeline.trainable_pipe import TrainablePipe
|
| 11 |
+
from spacy.vocab import Vocab
|
| 12 |
+
from spacy import Language
|
| 13 |
+
from thinc.model import set_dropout_rate
|
| 14 |
+
from wasabi import Printer
|
| 15 |
+
|
| 16 |
+
from typing import List, Tuple, Callable
|
| 17 |
+
|
| 18 |
import spacy
|
| 19 |
from spacy.tokens import Doc, Span
|
| 20 |
from thinc.types import Floats2d, Ints1d, Ragged, cast
|
| 21 |
+
from thinc.api import Model, Linear, chain, Logistic
|
| 22 |
|
| 23 |
import json
|
| 24 |
import os
|
| 25 |
import time
|
|
|
|
| 26 |
from pathlib import Path
|
| 27 |
|
| 28 |
from sklearn.metrics import precision_recall_fscore_support, f1_score
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
import plotly.express as px
|
| 30 |
import plotly.graph_objects as go
|
| 31 |
|
|
|
|
| 32 |
@spacy.registry.architectures("rel_model.v1")
|
| 33 |
def create_relation_model(
|
| 34 |
create_instance_tensor: Model[List[Doc], Floats2d],
|
|
|
|
| 270 |
self.set_annotations(docs, predictions)
|
| 271 |
return losses
|
| 272 |
|
| 273 |
+
def get_focal_loss(self, examples: Iterable[Example], scores, gamma=3.0, alpha=0.25, eps=1e-8) -> Tuple[float, float]:
|
| 274 |
+
truths = self._examples_to_truth(examples)
|
| 275 |
+
scores_2 = numpy.clip(scores, eps, 1. - eps)
|
| 276 |
+
p_t = numpy.clip(scores_2 * truths + (1 - scores_2) * (1 - truths), eps, 1. - eps)
|
| 277 |
+
|
| 278 |
+
focal_loss = -(1 - p_t) ** gamma * numpy.log(p_t)
|
| 279 |
+
loss = numpy.mean(numpy.sum(focal_loss, axis=1))
|
| 280 |
+
gradient = focal_loss * (1 - 2 * truths)
|
| 281 |
+
return float(loss), gradient
|
| 282 |
+
|
| 283 |
+
|
| 284 |
def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
|
| 285 |
"""Find the loss and gradient of loss for the batch of documents and
|
| 286 |
their predicted scores."""
|
|
|
|
| 468 |
result["rel_weighted_f"] = f1_score(y_true, y_pred, average="weighted", labels=labels, zero_division=0)
|
| 469 |
|
| 470 |
return result
|
|
|
relation_extractor/model
CHANGED
|
Binary files a/relation_extractor/model and b/relation_extractor/model differ
|
|
|
ru_patents_rel_tiny-any-py3-none-any.whl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4054040e76b605f22e2513c94ab0a96dc601bdd49d0defdd85a3ace67f830aea
|
| 3 |
+
size 108770148
|
transformer/cfg
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
{
|
| 2 |
-
"max_batch_items":
|
| 3 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"max_batch_items":2096
|
| 3 |
}
|
transformer/model
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:35244535987be2a96005cdc63d93ae46ff5c0ab749f47435c212c65b94d176a2
|
| 3 |
+
size 120294214
|