refactor: allow custom Encoder instances
Files changed:
- README.md: +2 -2
- encoder_models.py: +48 -40
- semf1.py: +157 -108
- tests.py: +135 -76
README.md
CHANGED

@@ -59,8 +59,8 @@ Sem-F1 takes 2 mandatory arguments:

 Sem-F1 also accepts multiple optional arguments:

-- `model_type (str)`: Model to use for encoding sentences. Options: ['pv1' ([paraphrase-distilroberta-base-v1](https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1)), 'stsb' ([stsb-roberta-large](https://huggingface.co/sentence-transformers/stsb-roberta-large)), 'use' ([Universal Sentence Encoder](https://huggingface.co/sentence-transformers/use-cmlm-multilingual)) (Default)]. Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by SentenceTransformer,
-  such as `all-mpnet-base-v2` or `roberta-base`.
+- `model_type (Optional[Union[str, Encoder]])`: Model to use for encoding sentences. Options: ['pv1' ([paraphrase-distilroberta-base-v1](https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1)), 'stsb' ([stsb-roberta-large](https://huggingface.co/sentence-transformers/stsb-roberta-large)), 'use' ([Universal Sentence Encoder](https://huggingface.co/sentence-transformers/use-cmlm-multilingual)) (Default)]. Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by SentenceTransformer,
+  such as `all-mpnet-base-v2` or `roberta-base`. Users can also pass a custom `Encoder`, which must implement the `encode` method. Refer to SemF1/encoder_models.py.
 - `tokenize_sentences (bool)`: Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
 - `multi_references (bool)`: Flag to indicate whether multiple references are provided. Default: False.
 - `gpu (Union[bool, str, int, List[Union[str, int]]])`: Whether to use GPU, CPU or multiple-processes for computation.
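For orientation, here is a minimal usage sketch of the updated argument. The module id passed to `evaluate.load` is an assumption (substitute this Space's actual path); the keyword arguments match the `_compute` signature shown in the semf1.py section below, and the custom-`Encoder` variant is sketched after the encoder_models.py section.

import evaluate

semf1 = evaluate.load("nbansal/semf1")  # module id is an assumption
scores = semf1.compute(
    predictions=["This is a prediction sentence 1. This is a prediction sentence 2."],
    references=["This is a reference sentence 1. This is a reference sentence 2."],
    model_type="all-mpnet-base-v2",  # 'pv1'/'stsb'/'use', any SentenceTransformer name, or an Encoder instance
    tokenize_sentences=True,
    multi_references=False,
    gpu=False,
)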
encoder_models.py
CHANGED

Summary of the change: `device`, `batch_size`, and `verbose` are no longer fixed at construction time. `SBertEncoder.__init__` now takes only `model_name` (previously it also took `device`, `batch_size`, and `verbose` and stored them on the instance), `get_encoder` shrinks to `get_encoder(model_name: str) -> Encoder`, and the three settings become keyword-only parameters of `encode()`. The abstract `Encoder.encode` now pins down this signature, so a custom encoder only has to implement `encode()` to satisfy the contract.

New version of the affected code (unchanged stretches elided with `...`):

@@ -9,68 +9,83 @@ from .type_aliases import ENCODER_DEVICE_TYPE

class Encoder(abc.ABC):
    @abc.abstractmethod
    def encode(
        self,
        prediction: List[str],
        *,
        device: ENCODER_DEVICE_TYPE = "cpu",
        batch_size: int = 32,
        verbose: bool = False,
    ) -> NDArray:
        """
        Abstract method to encode a list of sentences into sentence embeddings.

        Args:
            prediction (List[str]): List of sentences to encode.
            device (Union[str, int, List[Union[str, int]]]): Device specification for encoding.
            batch_size (int): Batch size for encoding.
            verbose (bool): Whether to print verbose information during encoding.

        Returns:
            NDArray: Array of sentence embeddings with shape (num_sentences, embedding_dim).

        Raises:
            NotImplementedError: If the method is not implemented in the subclass.
        """
        raise NotImplementedError("Method 'encode' must be implemented in subclass.")


class SBertEncoder(Encoder):
    def __init__(self, model_name: str):
        """
        Initialize SBertEncoder instance.

        Args:
            model_name (str): Name or path of the Sentence Transformer model.
        """
        self.model = SentenceTransformer(model_name, trust_remote_code=True)

    def encode(
        self,
        prediction: List[str],
        *,
        device: ENCODER_DEVICE_TYPE = "cpu",
        batch_size: int = 32,
        verbose: bool = False,
    ) -> NDArray:
        """
        Encode a list of sentences into sentence embeddings.

        Args:
            prediction (List[str]): List of sentences to encode.
            device (Union[str, int, List[Union[str, int]]]): Device specification for encoding.
            batch_size (int): Batch size for encoding.
            verbose (bool): Whether to print verbose information during encoding.

        Returns:
            NDArray: Array of sentence embeddings with shape (num_sentences, embedding_dim).
        """

        # SBert output is always Batch x Dim
        if isinstance(device, list):
            # Use multiprocess encoding for list of devices
            pool = self.model.start_multi_process_pool(target_devices=device)
            embeddings = self.model.encode_multi_process(
                prediction, pool=pool, batch_size=batch_size
            )
            self.model.stop_multi_process_pool(pool)
        else:
            # Single device encoding
            embeddings = self.model.encode(
                prediction,
                device=device,
                batch_size=batch_size,
                show_progress_bar=verbose,
            )
        return embeddings


def get_encoder(model_name: str) -> Encoder:
    """
    Get the encoder instance based on the specified model name.
    ...
    Furthermore, you can use any model on Huggingface/SentenceTransformer that is supported by
    SentenceTransformer.

    Returns:
        Encoder: Instance of the selected encoder based on the model_name.
    ...
    """

    try:
        encoder = SBertEncoder(model_name)  # , device, batch_size, verbose)
    except EnvironmentError as err:
        raise EnvironmentError(str(err)) from None
    except Exception as err:
        raise RuntimeError(str(err)) from None

    return encoder
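The contract above is small, so a custom encoder fits in a few lines. The sketch below is an illustration of the contract, not part of this commit; `HashingEncoder` and its bag-of-words trick are hypothetical stand-ins for a real embedding backend, and the relative import assumes the code lives inside this package.

from typing import List

import numpy as np
from numpy.typing import NDArray

from .encoder_models import Encoder  # import path assumes use inside this package


class HashingEncoder(Encoder):
    """Toy custom encoder: hashed bag-of-words vectors. Illustration only."""

    def __init__(self, dim: int = 64):
        self.dim = dim

    def encode(
        self,
        prediction: List[str],
        *,
        device="cpu",  # accepted for interface compatibility; unused here
        batch_size: int = 32,
        verbose: bool = False,
    ) -> NDArray:
        # Returns shape (num_sentences, embedding_dim), as the ABC requires.
        out = np.zeros((len(prediction), self.dim))
        for i, sent in enumerate(prediction):
            for tok in sent.lower().split():
                out[i, hash(tok) % self.dim] += 1.0
        return out

An instance of such a class can then be passed directly as `model_type`; the new dispatch in semf1.py below accepts any `Encoder` instance.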
semf1.py
CHANGED

Summary of the change: the imports gain `Union` and the `Encoder` class (plus an explicit, parenthesized `.utils` import list); the metric documentation now describes `model_type (Optional[Union[str, Encoder]])`; `_compute_cosine_similarity` and `_validate_input_format` gain full docstrings; the `datasets.Features` declarations and several call sites are reflowed; and `_compute` dispatches on the type of `model_type`, building an `SBertEncoder` for strings or None and using a passed `Encoder` instance directly.

New version of the affected code (unchanged stretches elided with `...`):

from typing import List, Optional, Tuple, Union

import datasets
import evaluate
...
from numpy.typing import NDArray
from sklearn.metrics.pairwise import cosine_similarity

from .encoder_models import get_encoder, Encoder
from .type_aliases import DEVICE_TYPE, PREDICTION_TYPE, REFERENCE_TYPE
from .utils import (
    is_nested_list_of_type,
    Scores,
    slice_embeddings,
    flatten_list,
    get_gpu,
    sent_tokenize,
)

_CITATION = """\
@inproceedings{bansal-etal-2022-sem,
...

In the metric documentation string, the argument list becomes:

Args:
    predictions (list): List of predictions. Format varies based on `tokenize_sentences` and `multi_references` flags.
    references (list): List of references. Format varies based on `tokenize_sentences` and `multi_references` flags.
    model_type (Optional[Union[str, Encoder]]): Model to use for encoding sentences.
        Options: ['pv1', 'stsb', 'use']
            pv1 - paraphrase-distilroberta-base-v1
            stsb - stsb-roberta-large
            use - Universal Sentence Encoder (Default)
        - A string path or name for any model on Huggingface/SentenceTransformer that is supported by
          SentenceTransformer, such as `all-mpnet-base-v2` or `roberta-base`.
        - A custom instance of an Encoder (must implement the encode() method). Refer to SemF1/encoder_models.py.
    tokenize_sentences (bool): Flag to indicate whether to tokenize the sentences in the input documents. Default: True.
    multi_references (bool): Flag to indicate whether multiple references are provided. Default is False.
    gpu (Union[bool, str, int, List[Union[str, int]]]): Whether to use GPU or CPU for computation.
...

def _compute_cosine_similarity(
    pred_embeds: NDArray, ref_embeds: NDArray
) -> Tuple[float, float]:
    """
    Compute precision and recall based on cosine similarity between predicted and reference embeddings.

    Args:
        pred_embeds (NDArray): Predicted embeddings (shape: [num_pred, embedding_dim]).
        ref_embeds (NDArray): Reference embeddings (shape: [num_ref, embedding_dim]).

    Returns:
        Tuple[float, float]: Precision and recall based on cosine similarity scores.
            Precision: Average maximum cosine similarity score per predicted embedding.
            Recall: Average maximum cosine similarity score per reference embedding.
    """
    # Compute cosine similarity between predicted and reference embeddings
    cosine_scores = cosine_similarity(pred_embeds, ref_embeds)
    ...
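The new docstring pins down the semantics: precision averages each prediction row's maximum similarity, recall averages each reference column's maximum. A standalone sketch of that computation follows; it illustrates the documented behavior, and the exact axis calls past the line shown above are my assumption, not the module's verbatim code.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

pred_embeds = np.array([[1.0, 0.0], [0.0, 1.0]])             # 2 prediction sentences
ref_embeds = np.array([[1.0, 0.0], [0.7, 0.7], [0.0, 1.0]])  # 3 reference sentences

cosine_scores = cosine_similarity(pred_embeds, ref_embeds)   # shape (2, 3)
precision = cosine_scores.max(axis=-1).mean()  # best match per prediction row -> 1.0
recall = cosine_scores.max(axis=0).mean()      # best match per reference column
print(precision, recall)  # 1.0 and ~0.902: the middle reference's best match is cos(45 deg) ~ 0.707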
Continuing the new version of semf1.py:

def _validate_input_format(
    tokenize_sentences: bool,
    multi_references: bool,
    predictions: PREDICTION_TYPE,
    references: REFERENCE_TYPE,
):
    """
    Validate the format of predictions and references based on specified criteria.

    Args:
        - tokenize_sentences (bool): Flag indicating whether sentences should be tokenized.
        - multi_references (bool): Flag indicating whether multiple references are provided.
        - predictions (PREDICTION_TYPE): Predictions to validate.
        - references (REFERENCE_TYPE): References to validate.

    Raises:
        - ValueError: If the format of predictions or references does not meet the specified criteria.

    Validation Criteria:
    The function validates predictions and references based on the following conditions:
    1. If `tokenize_sentences` is True and `multi_references` is True:
        - Predictions must be a list of strings (`is_list_of_strings_at_depth(predictions, 1)`).
        - References must be a list of list of strings (`is_list_of_strings_at_depth(references, 2)`).

    2. If `tokenize_sentences` is False and `multi_references` is True:
        - Predictions must be a list of list of strings (`is_list_of_strings_at_depth(predictions, 2)`).
        - References must be a list of list of list of strings (`is_list_of_strings_at_depth(references, 3)`).

    3. If `tokenize_sentences` is True and `multi_references` is False:
        - Predictions must be a list of strings (`is_list_of_strings_at_depth(predictions, 1)`).
        - References must be a list of strings (`is_list_of_strings_at_depth(references, 1)`).

    4. If `tokenize_sentences` is False and `multi_references` is False:
        - Predictions must be a list of list of strings (`is_list_of_strings_at_depth(predictions, 2)`).
        - References must be a list of list of strings (`is_list_of_strings_at_depth(references, 2)`).

    The function checks these conditions and raises a ValueError if any condition is not met,
    indicating that predictions or references are not in the valid input format.

    Note:
        - `PREDICTION_TYPE` and `REFERENCE_TYPE` are defined at the top of the file
    """

    if len(predictions) != len(references):
        raise ValueError(
            f"Predictions and references must have the same length. "
            f"Got {len(predictions)} predictions and {len(references)} references."
        )

    if len(predictions) == 0:
        raise ValueError("Can't have empty inputs")

    def check_format(lst_obj, expected_depth: int, name: str):
        is_valid, error_message = is_nested_list_of_type(
            lst_obj, element_type=str, depth=expected_depth
        )
        if not is_valid:
            raise ValueError(
                f"{name} are not in the expected format.\n" f"Error: {error_message}."
            )

    try:
        if tokenize_sentences and multi_references:
        ...

Inside the SemF1 metric class, the feature declarations and `reference_urls` are reflowed:

            datasets.Features(
                {
                    # predictions: List[List[str]] - List of predictions where prediction is a list of sentences
                    "predictions": datasets.Sequence(
                        datasets.Value("string", id="sequence"), id="predictions"
                    ),
                    # references: List[List[str]] - List of references where each reference is a list of sentences
                    "references": datasets.Sequence(
                        datasets.Value("string", id="sequence"), id="references"
                    ),
                }
            ),
            # F1: Multi References: False, Tokenize_Sentences = True
            ...
            datasets.Features(
                {
                    # predictions: List[List[str]] - List of predictions where prediction is a list of sentences
                    "predictions": datasets.Sequence(
                        datasets.Value("string", id="sequence"), id="predictions"
                    ),
                    # references: List[List[List[str]]] - List of multi-references.
                    # So each "reference" is also a list (r1, r2, ...).
                    # Further, each ri's are also list of sentences.
                    "references": datasets.Sequence(
                        datasets.Sequence(
                            datasets.Value("string", id="sequence"), id="ref"
                        ),
                        id="references",
                    ),
                }
            ),
            # F3: Multi References: True, Tokenize_Sentences = True
            ...
                    "predictions": datasets.Value("string", id="sequence"),
                    # references: List[List[List[str]]] - List of multi-references.
                    # So each "reference" is also a list (r1, r2, ...).
                    "references": datasets.Sequence(
                        datasets.Value("string", id="ref"), id="references"
                    ),
                }
            ),
        ],
        # # Homepage of the module for documentation
        # Additional links to the codebase or references
        reference_urls=["https://aclanthology.org/2022.emnlp-main.49/"],
    )

    def _get_model_name(self, model_type: Optional[str] = None) -> str:
    ...

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        import nltk

        nltk.download("punkt", quiet=True)

    def _compute(
        self,
        predictions,
        references,
        model_type: Optional[Union[str, Encoder]] = None,
        tokenize_sentences: bool = True,
        multi_references: bool = False,
        gpu: DEVICE_TYPE = False,
        batch_size: int = 32,
        verbose: bool = False,
        aggregate: bool = False,
    ) -> List[Scores]:
        """
        Compute precision, recall, and F1 scores for given predictions and references.

        Args:
            - predictions
            - references
            - model_type: Type of model to use for encoding.
                Options: [pv1, stsb, use]
                    pv1 - paraphrase-distilroberta-base-v1
                    stsb - stsb-roberta-large
                    use - Universal Sentence Encoder (Default)
                - A string path or name for any model on Huggingface/SentenceTransformer that is supported by
                  SentenceTransformer.
                - A custom instance of an Encoder (must implement the encode() method). Refer to SemF1/encoder_models.py.
            - tokenize_sentences: Flag to sentence tokenize the document.
            - multi_references: Flag to indicate multiple references.
            - gpu: GPU device to use.
            - batch_size: Batch size for encoding.
            - verbose: Flag to indicate verbose output.
            - aggregate: Flag to determine if output should be averaged.

        Returns:
            Singleton/List of Scores dataclass with attributes as follows -
                precision: float - precision score
                recall: List[float] - List of recall scores corresponding to single/multiple references
                f1: float - F1 score (between precision and average recall)
        """

        # Note: I have to specifically handle this case because the library considers the feature corresponding to
        # this case (F2) as the feature for the other case (F0) i.e. it can't make any distinction between
        # List[str] and List[List[str]]
        if not tokenize_sentences and multi_references:
            references = [
                [eval(ref) for ref in mul_ref_ex] for mul_ref_ex in references
            ]

        # Validate inputs corresponding to flags
        _validate_input_format(
            tokenize_sentences, multi_references, predictions, references
        )

        # Get GPU
        device = get_gpu(gpu)
        ...
        print(f"Using devices: {device}")

        # Get the encoder model
        if model_type is None or isinstance(model_type, str):
            model_name = self._get_model_name(model_type)
            encoder = get_encoder(model_name)
        elif isinstance(model_type, Encoder):
            encoder = model_type
        else:
            raise TypeError(
                f"Unsupported model_type: expected str or Encoder instance, got {type(model_type)}"
            )

        # We'll handle the single reference and multi-reference case same way. So change the data format accordingly
        if not multi_references:
        ...

        # Note: This is the most optimal way of doing it
        # Encode all sentences in one go
        embeddings = encoder.encode(
            all_sentences, device=device, batch_size=batch_size, verbose=verbose
        )

        # Get embeddings corresponding to predictions and references
        pred_embeddings = slice_embeddings(embeddings, prediction_sentences_count)
        ref_embeddings = slice_embeddings(
            embeddings[sum(prediction_sentences_count) :], reference_sentences_count
        )

        # Init output scores
        results = []
        ...
            precision = np.clip(precision, a_min=0.0, a_max=1.0).item()

            # Recall: Compute individually for each reference
            recall_scores = [
                _compute_cosine_similarity(r_embeds, preds) for r_embeds in refs
            ]
            recall_scores = [
                np.clip(r_scores, 0.0, 1.0).item() for (r_scores, _) in recall_scores
            ]

            results.append(Scores(precision, recall_scores))

        # run aggregation procedure
        if aggregate:
            mean_prec = np.mean([score.precision for score in results])
            mean_recall = np.mean(
                np.concatenate([np.array(score.recall) for score in results])
            )
            aggregated_score = Scores(float(mean_prec), [float(mean_recall)])
            results = aggregated_score

        return results
tests.py
CHANGED

Summary of the change: the `.utils` import becomes an explicit parenthesized list; fixtures and assertions are reflowed with wrapped calls and trailing commas; `TestSBertEncoder.setUp` now builds the encoder from `model_name` alone, the encode tests pass `device`, `batch_size`, and `verbose` to `encode()`, the initialization test drops its assertions on the removed `device`/`batch_size`/`verbose` attributes, and the multi-GPU test switches `devices` from `["cuda:0", "cuda:1"]` to `[0, 1]`; `TestGetEncoder` calls `get_encoder(model_name)` without the removed arguments; the aggregate tests print the returned score; and the `if __name__ == "__main__":` guard is spelled out.

New version of the affected code (unchanged stretches elided with `...`):

from .encoder_models import SBertEncoder, get_encoder
from .semf1 import SemF1, _compute_cosine_similarity, _validate_input_format
from .utils import (
    get_gpu,
    slice_embeddings,
    is_nested_list_of_type,
    flatten_list,
    compute_f1,
    Scores,
)


class TestUtils(unittest.TestCase):
    ...
        self.assertEqual(get_gpu(1), 1 if gpu_available else "cpu")

        # Test list input with unique elements
        self.assertEqual(
            get_gpu([True, "cpu", 0]),
            [0, "cpu"] if gpu_available else ["cpu", "cpu", "cpu"],
        )

        # Test list input with duplicate elements
        self.assertEqual(
            get_gpu([0, 0, "gpu"]), 0 if gpu_available else ["cpu", "cpu", "cpu"]
        )

        # Test list input with duplicate elements of different types
        self.assertEqual(
            get_gpu([True, 0, "gpu"]), 0 if gpu_available else ["cpu", "cpu", "cpu"]
        )

        # Test list input but only one element
        self.assertEqual(get_gpu([True]), 0 if gpu_available else "cpu")

        # Test list input with all integers
        self.assertEqual(
            get_gpu(list(range(gpu_count))),
            list(range(gpu_count)) if gpu_available else gpu_count * ["cpu"],
        )

        with self.assertRaises(ValueError):
            get_gpu("invalid")
    ...
        num_sentences = [3, 2, 5]
        expected_output = [embeddings[:3], embeddings[3:5], embeddings[5:]]
        self.assertTrue(
            all(
                np.array_equal(a, b)
                for a, b in zip(
                    slice_embeddings(embeddings, num_sentences), expected_output
                )
            )
        )

        num_sentences_nested = [[2, 1], [3, 4]]
        expected_output_nested = [
            [embeddings[:2], embeddings[2:3]],
            [embeddings[3:6], embeddings[6:]],
        ]
        self.assertTrue(
            slice_embeddings(embeddings, num_sentences_nested), expected_output_nested
        )
    ...
        self.assertEqual(is_valid, False)

        # Test case: Depth 1, list of elements matching element_type
        self.assertEqual(
            is_nested_list_of_type(["apple", "banana"], str, 1), (True, "")
        )

        # Test case: Depth 1, list of elements not matching element_type
        is_valid, err_msg = is_nested_list_of_type([1, 2, 3], str, 1)
    ...
        # Depth 2
        self.assertEqual(is_nested_list_of_type([[1, 2], [3, 4]], int, 2), (True, ""))
        self.assertEqual(
            is_nested_list_of_type([["1", "2"], ["3", "4"]], str, 2), (True, "")
        )
        is_valid, err_msg = is_nested_list_of_type([[1, 2], ["a", "b"]], int, 2)
        self.assertEqual(is_valid, False)

        # Depth 3
        is_valid, err_msg = is_nested_list_of_type([[[1], [2]], [[3], [4]]], list, 3)
        self.assertEqual(is_valid, False)
        self.assertEqual(
            is_nested_list_of_type([[[1], [2]], [[3], [4]]], int, 3), (True, "")
        )

        # Test case: Depth is negative, expecting ValueError
        with self.assertRaises(ValueError):
    ...


class TestSBertEncoder(unittest.TestCase):
    def setUp(self, device=None):
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device
        self.model_name = "stsb-roberta-large"
        self.batch_size = 8
        self.verbose = False
        self.encoder = SBertEncoder(self.model_name)

    def test_initialization(self):
        self.assertIsInstance(self.encoder.model, SentenceTransformer)

    def test_encode_single_device(self):
        sentences = ["This is a test sentence.", "Here is another sentence."]
        embeddings = self.encoder.encode(
            sentences,
            device=self.device,
            batch_size=self.batch_size,
            verbose=self.verbose,
        )
        self.assertIsInstance(embeddings, np.ndarray)
        self.assertEqual(embeddings.shape[0], len(sentences))
        self.assertEqual(
            embeddings.shape[1], self.encoder.model.get_sentence_embedding_dimension()
        )

    def test_encode_multi_device(self):
        if torch.cuda.device_count() < 2:
            self.skipTest("Multi-GPU test requires at least 2 GPUs.")
        else:
            # devices = ["cuda:0", "cuda:1"]
            devices = [0, 1]
            self.setUp(devices)
            sentences = [
                "This is a test sentence.",
                "Here is another sentence.",
                "This is a test sentence.",
            ]
            embeddings = self.encoder.encode(
                sentences,
                device=devices,
                batch_size=self.batch_size,
                verbose=self.verbose,
            )
            self.assertIsInstance(embeddings, np.ndarray)
            self.assertEqual(embeddings.shape[0], 3)
            self.assertEqual(
                embeddings.shape[1],
                self.encoder.model.get_sentence_embedding_dimension(),
            )


class TestGetEncoder(unittest.TestCase):
    ...
        self.verbose = False

    def _base_test(self, model_name):
        encoder = get_encoder(model_name)
        self.assertIsInstance(encoder, SBertEncoder)

    def test_get_sbert_encoder(self):
        model_name = "stsb-roberta-large"
    ...
        model_name = "roberta-base"
        self._base_test(model_name)

    def test_get_encoder_environment_error(self):
        model_name = "abc"  # Wrong model_name
        with self.assertRaises(EnvironmentError):
            get_encoder(model_name)

    def test_get_encoder_other_exception(self):
        model_name = "apple/OpenELM-270M"  # This model is not supported by SentenceTransformer lib
        with self.assertRaises(RuntimeError):
            get_encoder(model_name)


class TestSemF1(unittest.TestCase):
    ...
        # Example cases, #Samples = 1
        self.untokenized_single_reference_predictions = [
            "This is a prediction sentence 1. This is a prediction sentence 2."
        ]
        self.untokenized_single_reference_references = [
            "This is a reference sentence 1. This is a reference sentence 2."
        ]

        self.tokenized_single_reference_predictions = [
            ["This is a prediction sentence 1.", "This is a prediction sentence 2."],
    ...
            "Prediction sentence 1. Prediction sentence 2."
        ]
        self.untokenized_multi_reference_references = [
            [
                "Reference sentence 1. Reference sentence 2.",
                "Alternative reference 1. Alternative reference 2.",
            ],
        ]

        self.tokenized_multi_reference_predictions = [
    ...
        self.tokenized_multi_reference_references = [
            [
                ["Reference sentence 1.", "Reference sentence 2."],
                ["Alternative reference 1.", "Alternative reference 2."],
            ],
        ]
        self.multi_sample_refs = [
            "this is the first reference sample",
            "this is the second reference sample",
        ]
        self.multi_sample_preds = [
            "this is the first prediction sample",
            "this is the second prediction sample",
        ]

    def test_aggregate_multi_sample(self):
        """
        check if a `Scores` class is returned instead of a list of
        `Scores`
        """
        scores = self.semf1_metric.compute(
    ...
            aggregate=True,
        )
        self.assertIsInstance(scores, Scores)
        print(f"Score: {scores}")

(The same `print(f"Score: {scores}")` line is appended to test_aggregate_untokenized_single_ref, test_aggregate_tokenized_single_ref, test_aggregate_untokenized_multi_ref, test_aggregate_tokenized_multi_ref, and test_aggregate_same_pred_and_ref.)

    def test_untokenized_single_reference(self):
        scores = self.semf1_metric.compute(
    ...
            multi_references=False,
            gpu=False,
            batch_size=32,
            verbose=False,
        )
        self.assertIsInstance(scores, list)
        self.assertEqual(
            len(scores), len(self.untokenized_single_reference_predictions)
        )

(test_tokenized_single_reference, both multi-reference tests, and test_same_predictions_and_references pick up the same `verbose=False,` trailing comma.)

    ...
        for score in scores:
            self.assertIsInstance(score, Scores)
            self.assertAlmostEqual(score.precision, 1.0, places=6)
            assert_almost_equal(
                score.recall,
                1,
                decimal=5,
                err_msg="Not all values are almost equal to 1",
            )

    def test_exact_output_scores(self):
        predictions = [
    ...
            ["I am", "I am"],
            [None, "I am"],
        ]
        print(
            f"Case I\n{_call_metric(predictions, references, tokenize_sentences, multi_references)}\n"
        )

        # Case 2: tokenize_sentences = False, multi_references = True
        tokenize_sentences = False
    ...
            [["I am", "I am"], [None, "I am"]],
            [[None, "I am"]],
        ]
        print(
            f"Case II\n{_call_metric(predictions, references, tokenize_sentences, multi_references)}\n"
        )

        # Case 3: tokenize_sentences = True, multi_references = False
        tokenize_sentences = True
    ...
            "I am. I am.",
            "I am. I am.",
        ]
        print(
            f"Case III\n{_call_metric(predictions, references, tokenize_sentences, multi_references)}\n"
        )

        # Case 4: tokenize_sentences = False, multi_references = False
        # This is taken care by the library itself
    ...
            ["I am.", "I am."],
            ["I am.", "I am."],
        ]
        print(
            f"Case IV\n{_call_metric(predictions, references, tokenize_sentences, multi_references)}\n"
        )

    def test_empty_input(self):
        predictions = ["", ""]
    ...


class TestCosineSimilarity(unittest.TestCase):
    def setUp(self):
        # Sample embeddings for testing
        self.pred_embeds = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        self.ref_embeds = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])

        self.pred_embeds_random = np.random.rand(3, 3)
        self.ref_embeds_random = np.random.rand(3, 3)

    def test_cosine_similarity_perfect_match(self):
        precision, recall = _compute_cosine_similarity(
            self.pred_embeds, self.ref_embeds
        )

        # Expected values are 1.0 for both precision and recall since embeddings are identical
        self.assertAlmostEqual(precision, 1.0, places=5)
    ...
        self.assertAlmostEqual(recall, expected_recall, places=5)

    def test_cosine_similarity_random(self):
        self._test_cosine_similarity_base(
            self.pred_embeds_random, self.ref_embeds_random
        )

    def test_cosine_similarity_different_shapes(self):
        pred_embeds_diff = np.random.rand(5, 3)
    ...


class TestValidateInputFormat(unittest.TestCase):
    ...
        self.untokenized_multi_reference_references = [
            [
                "This is a reference sentence 1. This is a reference sentence 2.",
                "Another reference sentence.",
            ]
        ]
    ...
        self.tokenized_multi_reference_references = [
            [
                ["This is a reference sentence 1.", "This is a reference sentence 2."],
                ["Another reference sentence."],
            ]
        ]
    ...
            True,
            True,
            self.untokenized_single_reference_predictions,
            [
                self.untokenized_single_reference_predictions[0],
                self.untokenized_single_reference_predictions[0],
            ],
        )


def run_tests():
    unittest.main(verbosity=2)


if __name__ == "__main__":
    run_tests()