Spaces:
Runtime error
Runtime error
Support other SentenceTransformer models as well and update the documentation accordingly
Browse files- README.md +11 -3
- encoder_models.py +19 -20
- semf1.py +12 -8
- tests.py +35 -24
README.md
CHANGED
|
@@ -53,6 +53,10 @@ Sem-F1 also accepts multiple optional arguments:
|
|
| 53 |
- `pv1` - [paraphrase-distilroberta-base-v1](https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1)
|
| 54 |
- `stsb` - [stsb-roberta-large](https://huggingface.co/sentence-transformers/stsb-roberta-large)
|
| 55 |
- `use` - [Universal Sentence Encoder](https://huggingface.co/sentence-transformers/use-cmlm-multilingual) (Default)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
- `tokenize_sentences (bool)`: Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
|
| 57 |
- `multi_references (bool)`: Flag to indicate whether multiple references are provided. Default: False.
|
| 58 |
- `gpu (Union[bool, str, int, List[Union[str, int]]])`: Whether to use GPU, CPU or multiple-processes for computation.
|
|
@@ -79,10 +83,14 @@ List of `Scores` dataclass corresponding to each sample -
|
|
| 79 |
- `f1: float`: F1 score (between precision and average recall).
|
| 80 |
|
| 81 |
|
| 82 |
-
##
|
| 83 |
Currently, we have only implemented the 3 encoders* that we experimented with in our
|
| 84 |
-
[paper](https://aclanthology.org/2022.emnlp-main.49/).
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
`*` *In our paper, we used the Tensorflow [version](https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder)
|
| 88 |
of the USE model, however, in our current implementation, we used [PyTorch version](https://huggingface.co/sentence-transformers/use-cmlm-multilingual).*
|
|
|
|
| 53 |
- `pv1` - [paraphrase-distilroberta-base-v1](https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1)
|
| 54 |
- `stsb` - [stsb-roberta-large](https://huggingface.co/sentence-transformers/stsb-roberta-large)
|
| 55 |
- `use` - [Universal Sentence Encoder](https://huggingface.co/sentence-transformers/use-cmlm-multilingual) (Default)
|
| 56 |
+
|
| 57 |
+
Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by SentenceTransformer
|
| 58 |
+
such as `all-mpnet-base-v2` or `roberta-base`
|
| 59 |
+
|
| 60 |
- `tokenize_sentences (bool)`: Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
|
| 61 |
- `multi_references (bool)`: Flag to indicate whether multiple references are provided. Default: False.
|
| 62 |
- `gpu (Union[bool, str, int, List[Union[str, int]]])`: Whether to use GPU, CPU or multiple-processes for computation.
|
|
|
|
| 83 |
- `f1: float`: F1 score (between precision and average recall).
|
| 84 |
|
| 85 |
|
| 86 |
+
## Extensions
|
| 87 |
Currently, we have only implemented the 3 encoders* that we experimented with in our
|
| 88 |
+
[paper](https://aclanthology.org/2022.emnlp-main.49/). Furthermore, you can use any model on
|
| 89 |
+
Huggingface/SentenceTransformer that is supported by SentenceTransformer such as `all-mpnet-base-v2` or `roberta-base`.
|
| 90 |
+
|
| 91 |
+
If you want to use your own encoder model, make sure that it is supported by `SentenceTransformer`. Or, if it's a
|
| 92 |
+
completely new architecture, Sem-F1 can easily be extended to support it by subclassing the `Encoder` base class (Refer to
|
| 93 |
+
`encoder_models.py` file).
|
| 94 |
|
| 95 |
`*` *In our paper, we used the Tensorflow [version](https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder)
|
| 96 |
of the USE model, however, in our current implementation, we used [PyTorch version](https://huggingface.co/sentence-transformers/use-cmlm-multilingual).*
|
encoder_models.py
CHANGED
|
@@ -25,14 +25,6 @@ class Encoder(abc.ABC):
|
|
| 25 |
raise NotImplementedError("Method 'encode' must be implemented in subclass.")
|
| 26 |
|
| 27 |
|
| 28 |
-
class USE(Encoder):
|
| 29 |
-
def __init__(self):
|
| 30 |
-
pass
|
| 31 |
-
|
| 32 |
-
def encode(self, prediction: List[str]) -> NDArray:
|
| 33 |
-
pass
|
| 34 |
-
|
| 35 |
-
|
| 36 |
class SBertEncoder(Encoder):
|
| 37 |
def __init__(self, model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, verbose: bool):
|
| 38 |
"""
|
|
@@ -44,7 +36,7 @@ class SBertEncoder(Encoder):
|
|
| 44 |
batch_size (int): Batch size for encoding.
|
| 45 |
verbose (bool): Whether to print verbose information during encoding.
|
| 46 |
"""
|
| 47 |
-
self.model = SentenceTransformer(model_name)
|
| 48 |
self.device = device
|
| 49 |
self.batch_size = batch_size
|
| 50 |
self.verbose = verbose
|
|
@@ -84,10 +76,13 @@ def get_encoder(model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, v
|
|
| 84 |
|
| 85 |
Args:
|
| 86 |
model_name (str): Name of the model to instantiate
|
| 87 |
-
Options:
|
| 88 |
-
|
| 89 |
-
stsb
|
| 90 |
-
use
|
|
|
|
|
|
|
|
|
|
| 91 |
device (Union[str, int, List[Union[str, int]]): Device specification for the encoder
|
| 92 |
(e.g., "cuda", 0 for GPU, "cpu").
|
| 93 |
batch_size (int): Batch size for encoding.
|
|
@@ -97,12 +92,16 @@ def get_encoder(model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, v
|
|
| 97 |
Encoder: Instance of the selected encoder based on the model_name.
|
| 98 |
|
| 99 |
Raises:
|
| 100 |
-
|
| 101 |
"""
|
| 102 |
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
raise NotImplementedError("Method 'encode' must be implemented in subclass.")
|
| 26 |
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
class SBertEncoder(Encoder):
|
| 29 |
def __init__(self, model_name: str, device: ENCODER_DEVICE_TYPE, batch_size: int, verbose: bool):
|
| 30 |
"""
|
|
|
|
| 36 |
batch_size (int): Batch size for encoding.
|
| 37 |
verbose (bool): Whether to print verbose information during encoding.
|
| 38 |
"""
|
| 39 |
+
self.model = SentenceTransformer(model_name, trust_remote_code=True)
|
| 40 |
self.device = device
|
| 41 |
self.batch_size = batch_size
|
| 42 |
self.verbose = verbose
|
|
|
|
| 76 |
|
| 77 |
Args:
|
| 78 |
model_name (str): Name of the model to instantiate
|
| 79 |
+
Options:
|
| 80 |
+
paraphrase-distilroberta-base-v1,
|
| 81 |
+
stsb-roberta-large,
|
| 82 |
+
sentence-transformers/use-cmlm-multilingual
|
| 83 |
+
Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by
|
| 84 |
+
SentenceTransformer.
|
| 85 |
+
|
| 86 |
device (Union[str, int, List[Union[str, int]]): Device specification for the encoder
|
| 87 |
(e.g., "cuda", 0 for GPU, "cpu").
|
| 88 |
batch_size (int): Batch size for encoding.
|
|
|
|
| 92 |
Encoder: Instance of the selected encoder based on the model_name.
|
| 93 |
|
| 94 |
Raises:
|
| 95 |
+
EnvironmentError/RuntimeError: If an unsupported model_name is provided.
|
| 96 |
"""
|
| 97 |
|
| 98 |
+
try:
|
| 99 |
+
encoder = SBertEncoder(model_name, device, batch_size, verbose)
|
| 100 |
+
except EnvironmentError as err:
|
| 101 |
+
raise EnvironmentError(str(err)) from None
|
| 102 |
+
except Exception as err:
|
| 103 |
+
raise RuntimeError(str(err)) from None
|
| 104 |
+
|
| 105 |
+
return encoder
|
| 106 |
+
|
| 107 |
+
|
semf1.py
CHANGED
|
@@ -62,9 +62,12 @@ Args:
|
|
| 62 |
predictions (list): List of predictions. Format varies based on `tokenize_sentences` and `multi_references` flags.
|
| 63 |
references (list): List of references. Format varies based on `tokenize_sentences` and `multi_references` flags.
|
| 64 |
model_type (str): Model to use for encoding sentences. Options: ['pv1', 'stsb', 'use']
|
| 65 |
-
pv1 - paraphrase-distilroberta-base-v1
|
| 66 |
stsb - stsb-roberta-large
|
| 67 |
-
use - Universal Sentence Encoder
|
|
|
|
|
|
|
|
|
|
| 68 |
tokenize_sentences (bool): Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
|
| 69 |
multi_references (bool): Flag to indicate whether multiple references are provided. Default is False.
|
| 70 |
gpu (Union[bool, str, int, List[Union[str, int]]]): Whether to use GPU or CPU for computation.
|
|
@@ -241,7 +244,7 @@ class SemF1(evaluate.Metric):
|
|
| 241 |
_MODEL_TYPE_TO_NAME = {
|
| 242 |
"pv1": "paraphrase-distilroberta-base-v1",
|
| 243 |
"stsb": "stsb-roberta-large",
|
| 244 |
-
"use": "
|
| 245 |
}
|
| 246 |
|
| 247 |
def _info(self):
|
|
@@ -304,9 +307,7 @@ class SemF1(evaluate.Metric):
|
|
| 304 |
model_type = "use"
|
| 305 |
|
| 306 |
if model_type not in self._MODEL_TYPE_TO_NAME.keys():
|
| 307 |
-
|
| 308 |
-
f"Options: {self._MODEL_TYPE_TO_NAME.keys()}\n"
|
| 309 |
-
f"Currently provided: {model_type}")
|
| 310 |
|
| 311 |
return self._MODEL_TYPE_TO_NAME[model_type]
|
| 312 |
|
|
@@ -335,9 +336,12 @@ class SemF1(evaluate.Metric):
|
|
| 335 |
:param references
|
| 336 |
:param model_type: Type of model to use for encoding.
|
| 337 |
Options: [pv1, stsb, use]
|
| 338 |
-
pv1 - paraphrase-distilroberta-base-v1
|
| 339 |
stsb - stsb-roberta-large
|
| 340 |
-
use - Universal Sentence Encoder
|
|
|
|
|
|
|
|
|
|
| 341 |
:param tokenize_sentences: Flag to sentence tokenize the document.
|
| 342 |
:param multi_references: Flag to indicate multiple references.
|
| 343 |
:param gpu: GPU device to use.
|
|
|
|
| 62 |
predictions (list): List of predictions. Format varies based on `tokenize_sentences` and `multi_references` flags.
|
| 63 |
references (list): List of references. Format varies based on `tokenize_sentences` and `multi_references` flags.
|
| 64 |
model_type (str): Model to use for encoding sentences. Options: ['pv1', 'stsb', 'use']
|
| 65 |
+
pv1 - paraphrase-distilroberta-base-v1
|
| 66 |
stsb - stsb-roberta-large
|
| 67 |
+
use - Universal Sentence Encoder (Default)
|
| 68 |
+
Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by SentenceTransformer such
|
| 69 |
+
as `all-mpnet-base-v2` or `roberta-base`
|
| 70 |
+
|
| 71 |
tokenize_sentences (bool): Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
|
| 72 |
multi_references (bool): Flag to indicate whether multiple references are provided. Default is False.
|
| 73 |
gpu (Union[bool, str, int, List[Union[str, int]]]): Whether to use GPU or CPU for computation.
|
|
|
|
| 244 |
_MODEL_TYPE_TO_NAME = {
|
| 245 |
"pv1": "paraphrase-distilroberta-base-v1",
|
| 246 |
"stsb": "stsb-roberta-large",
|
| 247 |
+
"use": "sentence-transformers/use-cmlm-multilingual",
|
| 248 |
}
|
| 249 |
|
| 250 |
def _info(self):
|
|
|
|
| 307 |
model_type = "use"
|
| 308 |
|
| 309 |
if model_type not in self._MODEL_TYPE_TO_NAME.keys():
|
| 310 |
+
return model_type
|
|
|
|
|
|
|
| 311 |
|
| 312 |
return self._MODEL_TYPE_TO_NAME[model_type]
|
| 313 |
|
|
|
|
| 336 |
:param references
|
| 337 |
:param model_type: Type of model to use for encoding.
|
| 338 |
Options: [pv1, stsb, use]
|
| 339 |
+
pv1 - paraphrase-distilroberta-base-v1
|
| 340 |
stsb - stsb-roberta-large
|
| 341 |
+
use - Universal Sentence Encoder (Default)
|
| 342 |
+
Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by
|
| 343 |
+
SentenceTransformer.
|
| 344 |
+
|
| 345 |
:param tokenize_sentences: Flag to sentence tokenize the document.
|
| 346 |
:param multi_references: Flag to indicate multiple references.
|
| 347 |
:param gpu: GPU device to use.
|
tests.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import statistics
|
| 2 |
import unittest
|
|
|
|
| 3 |
|
| 4 |
import numpy as np
|
| 5 |
import torch
|
|
@@ -153,32 +154,42 @@ class TestSBertEncoder(unittest.TestCase):
|
|
| 153 |
|
| 154 |
|
| 155 |
class TestGetEncoder(unittest.TestCase):
|
| 156 |
-
def
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
verbose = False
|
| 161 |
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
| 163 |
self.assertIsInstance(encoder, SBertEncoder)
|
| 164 |
-
self.assertEqual(encoder.device, device)
|
| 165 |
-
self.assertEqual(encoder.batch_size, batch_size)
|
| 166 |
-
self.assertEqual(encoder.verbose, verbose)
|
| 167 |
-
|
| 168 |
-
def
|
| 169 |
-
model_name = "
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
|
| 183 |
|
| 184 |
class TestSemF1(unittest.TestCase):
|
|
|
|
| 1 |
import statistics
|
| 2 |
import unittest
|
| 3 |
+
from unittest.mock import patch, MagicMock
|
| 4 |
|
| 5 |
import numpy as np
|
| 6 |
import torch
|
|
|
|
| 154 |
|
| 155 |
|
| 156 |
class TestGetEncoder(unittest.TestCase):
|
| 157 |
+
def setUp(self):
|
| 158 |
+
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 159 |
+
self.batch_size = 8
|
| 160 |
+
self.verbose = False
|
|
|
|
| 161 |
|
| 162 |
+
def _base_test(self, model_name):
|
| 163 |
+
encoder = get_encoder(model_name, self.device, self.batch_size, self.verbose)
|
| 164 |
+
|
| 165 |
+
# Assert
|
| 166 |
self.assertIsInstance(encoder, SBertEncoder)
|
| 167 |
+
self.assertEqual(encoder.device, self.device)
|
| 168 |
+
self.assertEqual(encoder.batch_size, self.batch_size)
|
| 169 |
+
self.assertEqual(encoder.verbose, self.verbose)
|
| 170 |
+
|
| 171 |
+
def test_get_sbert_encoder(self):
|
| 172 |
+
model_name = "stsb-roberta-large"
|
| 173 |
+
self._base_test(model_name)
|
| 174 |
+
|
| 175 |
+
def test_sbert_model(self):
|
| 176 |
+
model_name = "all-mpnet-base-v2"
|
| 177 |
+
self._base_test(model_name)
|
| 178 |
+
|
| 179 |
+
def test_huggingface_model(self):
|
| 180 |
+
"""Test Huggingface models which work with SBert library"""
|
| 181 |
+
model_name = "roberta-base"
|
| 182 |
+
self._base_test(model_name)
|
| 183 |
+
|
| 184 |
+
def test_get_encoder_environment_error(self): # This parameter is used when using patch decorator
|
| 185 |
+
model_name = "abc" # Wrong model_name
|
| 186 |
+
with self.assertRaises(EnvironmentError):
|
| 187 |
+
get_encoder(model_name, self.device, self.batch_size, self.verbose)
|
| 188 |
+
|
| 189 |
+
def test_get_encoder_other_exception(self):
|
| 190 |
+
model_name = "apple/OpenELM-270M" # This model is not supported by SentenceTransformer lib
|
| 191 |
+
with self.assertRaises(RuntimeError):
|
| 192 |
+
get_encoder(model_name, self.device, self.batch_size, self.verbose)
|
| 193 |
|
| 194 |
|
| 195 |
class TestSemF1(unittest.TestCase):
|