alverciito committed on
Commit · d0a3e2d
Parent(s): 0b2800e

add zero shot experiment and similarity method

Browse files
- README.md +2 -0
- model.py +18 -0
- research_files/benchmark/segmentation_benchmark/zero_shot_transfer.py +44 -0
README.md
CHANGED

```diff
@@ -137,6 +137,8 @@ Multiple segmentation strategies are evaluated, such as PELT, binary segmentation
 | Neural (frozen LM) | XLM-R (frozen) + PELT / BinSeg / cosine similarity | Yes | No |
 | Proposed | CoseNet Transformer (sentence encoder + CoSeNet layering + candidate masking + pooling) | Yes | Yes |
 
+**WindowDiff (WD)** is used as the primary segmentation error metric.
+Lower values indicate better segmentation quality. In this work, **WindowDiff values ≤ 0.30 are considered acceptable**, values **≤ 0.20 indicate good performance**, and values **≤ 0.10 indicate strong segmentation accuracy** under standard evaluation settings.
 
 
 The benchmark entry point is **bench.py**, and results are stored as JSON files for reproducibility and further analysis.
```
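For context on the metric added above: WindowDiff slides a fixed-width window over the reference and hypothesis boundary sequences and reports the fraction of windows in which the two disagree on the number of boundaries, so it falls in [0, 1] and lower is better. Below is a minimal sketch, equivalent to NLTK's unweighted `nltk.metrics.windowdiff`; the boundary strings in the example are illustrative, not benchmark data.

```python
def windowdiff(ref: str, hyp: str, k: int) -> float:
    """Fraction of width-k windows in which the reference and the
    hypothesis disagree on the number of segment boundaries ('1's)."""
    if len(ref) != len(hyp):
        raise ValueError("Segmentations must have equal length.")
    n = len(ref)
    disagreements = sum(
        ref[i:i + k].count("1") != hyp[i:i + k].count("1")
        for i in range(n - k + 1)
    )
    return disagreements / (n - k + 1)


# Illustrative boundary strings: a hypothesis that misplaces one boundary.
print(windowdiff("0100100", "0101000", k=3))  # 0.4
```

The README's ≤ 0.30 / ≤ 0.20 / ≤ 0.10 thresholds apply directly to this [0, 1] scale, so the 0.4 in the example would fall outside the acceptable range.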
model.py
CHANGED

```diff
@@ -214,6 +214,24 @@ class SentenceCoseNet(PreTrainedModel):
         )
         return torch.nn.functional.normalize(pooled, p=2, dim=-1) if normalize else pooled
 
+    def similarity(self, embeddings: torch.Tensor) -> torch.Tensor:
+        """
+        Compute cosine similarity scores between two sets of embeddings.
+
+        Args:
+            embeddings (torch.Tensor):
+                Tensor of shape (B, S, D) containing two sets of
+                embeddings concatenated along the first dimension.
+
+        Returns:
+            torch.Tensor:
+                Similarity scores of shape (B, S, S).
+        """
+        # The distance layer computes the pairwise cosine similarities.
+        x = self.model.distance_layer(embeddings)
+        # Resulting shape: (B, S, S).
+        return x
+
     def forward(
         self,
         input_ids: torch.Tensor,
```
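A minimal usage sketch for the new `similarity` method, mirroring the calls made in the zero-shot script below. The checkpoint identifier is a placeholder, and `trust_remote_code=True` is an assumption needed so that `AutoModel` resolves to the custom `SentenceCoseNet` class exposing `get_sentence_embedding` and `similarity`:

```python
import torch
from transformers import AutoTokenizer, AutoModel

MODEL_REPO = "your-org/cosenet-checkpoint"  # placeholder identifier

tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
# trust_remote_code=True assumed so the custom SentenceCoseNet class loads.
model = AutoModel.from_pretrained(MODEL_REPO, trust_remote_code=True)

texts = ["First topic sentence.", "Still the first topic.", "A new topic starts."]
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

with torch.no_grad():
    # Normalized sentence embeddings of shape (B, S, D).
    embeddings = model.get_sentence_embedding(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        normalize=True,
    )
    # Pairwise cosine similarity matrix of shape (B, S, S).
    scores = model.similarity(embeddings)
```

Since `get_sentence_embedding` already L2-normalizes its output when `normalize=True`, the scores returned by `similarity` behave as cosine similarities.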
research_files/benchmark/segmentation_benchmark/zero_shot_transfer.py
ADDED

```diff
@@ -0,0 +1,44 @@
+# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
+#                                                           #
+#   This file was created by: Alberto Palomo Alonso         #
+#   Universidad de Alcalá - Escuela Politécnica Superior    #
+#                                                           #
+# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
+# Import statements:
+import torch
+import tqdm
+from transformers import AutoTokenizer, AutoModel
+from datasets import load_dataset
+
+
+def zero_shot_proposed(
+        model_repo: str = None,
+        data_repo: str = None,
+        batch_size: int = 32,
+        logit_th: float = 1.9,
+        device: torch.device = torch.device('cpu')
+):
+    """
+    Run the proposed model in zero-shot transfer on a segmentation dataset.
+    """
+    # Pathing:
+    if model_repo is None:
+        model_repo = input("Enter the model repository path or identifier: ")
+    if data_repo is None:
+        data_repo = input("Enter the dataset repository path or identifier: ")
+
+    # Loading:
+    tokenizer = AutoTokenizer.from_pretrained(model_repo)
+    model = AutoModel.from_pretrained(model_repo)
+    dataset = load_dataset(data_repo)
+
+    for batch in tqdm.tqdm(dataset['test'].batch(batch_size)):
+
+        # The text field to tokenize is still elided (...) in this commit.
+        inputs = tokenizer(..., return_tensors="pt", padding=True, truncation=True)
+        embeddings = model.get_sentence_embedding(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], normalize=True)
+        similarity = model.similarity(embeddings)
+
+# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
+#                        END OF FILE                        #
+# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
```