alverciito committed on
Commit · d0a3e2d
Parent(s): 0b2800e

add zero shot experiment and similarity method

Browse files
- README.md +2 -0
- model.py +18 -0
- research_files/benchmark/segmentation_benchmark/zero_shot_transfer.py +44 -0
README.md
CHANGED

```diff
@@ -137,6 +137,8 @@ Multiple segmentation strategies are evaluated, such as PELT, binary segmentation
 | Neural (frozen LM) | XLM-R (frozen) + PELT / BinSeg / cosine similarity | Yes | No |
 | Proposed | CoseNet Transformer (sentence encoder + CoSeNet layering + candidate masking + pooling) | Yes | Yes |
 
+**WindowDiff (WD)** is used as the primary segmentation error metric.
+Lower values indicate better segmentation quality. In this work, **WindowDiff values ≤ 0.30 are considered acceptable**, values **≤ 0.20 indicate good performance**, and values **≤ 0.10 indicate strong segmentation accuracy** under standard evaluation settings.
 
 
 The benchmark entry point is **bench.py**, and results are stored as JSON files for reproducibility and further analysis.
```
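For context on the metric added above: WindowDiff slides a fixed-width window over the reference and hypothesis boundary sequences and reports the fraction of windows in which the two disagree on the number of boundaries, so it falls in [0, 1] and lower is better. Below is a minimal sketch, equivalent to NLTK's unweighted `nltk.metrics.windowdiff`; the boundary strings in the example are illustrative, not benchmark data.

```python
def windowdiff(ref: str, hyp: str, k: int) -> float:
    """Fraction of width-k windows in which the reference and the
    hypothesis disagree on the number of segment boundaries ('1's)."""
    if len(ref) != len(hyp):
        raise ValueError("Segmentations must have equal length.")
    n = len(ref)
    disagreements = sum(
        ref[i:i + k].count("1") != hyp[i:i + k].count("1")
        for i in range(n - k + 1)
    )
    return disagreements / (n - k + 1)


# Illustrative boundary strings: a hypothesis that misplaces one boundary.
print(windowdiff("0100100", "0101000", k=3))  # 0.4
```

The README's ≤ 0.30 / ≤ 0.20 / ≤ 0.10 thresholds apply directly to this [0, 1] scale, so the 0.4 in the example would fall outside the acceptable range.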
model.py
CHANGED

```diff
@@ -214,6 +214,24 @@ class SentenceCoseNet(PreTrainedModel):
         )
         return torch.nn.functional.normalize(pooled, p=2, dim=-1) if normalize else pooled
 
+    def similarity(self, embeddings: torch.Tensor) -> torch.Tensor:
+        """
+        Compute cosine similarity scores between two sets of embeddings.
+
+        Args:
+            embeddings (torch.Tensor):
+                Tensor of shape (B, S, D) containing two sets of
+                embeddings concatenated along the first dimension.
+
+        Returns:
+            torch.Tensor:
+                Similarity scores of shape (B, S, S).
+        """
+        # The distance layer computes the pairwise cosine similarities.
+        x = self.model.distance_layer(embeddings)
+        # Resulting shape: (B, S, S).
+        return x
+
     def forward(
         self,
         input_ids: torch.Tensor,
```
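A minimal usage sketch for the new `similarity` method, mirroring the calls made in the zero-shot script below. The checkpoint identifier is a placeholder, and `trust_remote_code=True` is an assumption needed so that `AutoModel` resolves to the custom `SentenceCoseNet` class exposing `get_sentence_embedding` and `similarity`:

```python
import torch
from transformers import AutoTokenizer, AutoModel

MODEL_REPO = "your-org/cosenet-checkpoint"  # placeholder identifier

tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
# trust_remote_code=True assumed so the custom SentenceCoseNet class loads.
model = AutoModel.from_pretrained(MODEL_REPO, trust_remote_code=True)

texts = ["First topic sentence.", "Still the first topic.", "A new topic starts."]
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

with torch.no_grad():
    # Normalized sentence embeddings of shape (B, S, D).
    embeddings = model.get_sentence_embedding(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        normalize=True,
    )
    # Pairwise cosine similarity matrix of shape (B, S, S).
    scores = model.similarity(embeddings)
```

Since `get_sentence_embedding` already L2-normalizes its output when `normalize=True`, the scores returned by `similarity` behave as cosine similarities.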
research_files/benchmark/segmentation_benchmark/zero_shot_transfer.py
ADDED

```diff
@@ -0,0 +1,44 @@
+# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
+#                                                           #
+#   This file was created by: Alberto Palomo Alonso         #
+#   Universidad de Alcalá - Escuela Politécnica Superior    #
+#                                                           #
+# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
+# Import statements:
+import torch
+import tqdm
+from transformers import AutoTokenizer, AutoModel
+from datasets import load_dataset
+
+
+def zero_shot_proposed(
+        model_repo: str = None,
+        data_repo: str = None,
+        batch_size: int = 32,
+        logit_th: float = 1.9,
+        device: torch.device = torch.device('cpu')
+):
+    """
+    Run the proposed model in zero-shot transfer on a segmentation dataset.
+    """
+    # Pathing:
+    if model_repo is None:
+        model_repo = input("Enter the model repository path or identifier: ")
+    if data_repo is None:
+        data_repo = input("Enter the dataset repository path or identifier: ")
+
+    # Loading:
+    tokenizer = AutoTokenizer.from_pretrained(model_repo)
+    model = AutoModel.from_pretrained(model_repo)
+    dataset = load_dataset(data_repo)
+
+    for batch in tqdm.tqdm(dataset['test'].batch(batch_size)):
+
+        # The text field to tokenize is still elided (...) in this commit.
+        inputs = tokenizer(..., return_tensors="pt", padding=True, truncation=True)
+        embeddings = model.get_sentence_embedding(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], normalize=True)
+        similarity = model.similarity(embeddings)
+
+# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
+#                        END OF FILE                        #
+# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
```