kadarakos commited on
Commit
ba006b9
·
1 Parent(s): f84045f
.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
pyproject.toml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "mentioned"
3
+ version = "0.1.0"
4
+ description = "Sentence-level mention detection trained on LitBank (proof of concept)"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "kadarakos", email = "kadar.akos@gmail.com" }
8
+ ]
9
+ requires-python = ">=3.12"
10
+ dependencies = [
11
+ "datasets>=4.6.0",
12
+ "huggingface-hub>=1.4.1",
13
+ "lightning>=2.6.1",
14
+ "torch>=2.10.0",
15
+ "torchmetrics>=1.8.2",
16
+ "transformers>=5.2.0",
17
+ "wandb>=0.25.0",
18
+ ]
19
+
20
+ [build-system]
21
+ requires = ["uv_build>=0.9.9,<0.10.0"]
22
+ build-backend = "uv_build"
23
+
24
+ [dependency-groups]
25
+ dev = [
26
+ "ruff>=0.15.2",
27
+ ]
src/mentioned/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
def hello() -> str:
    """Return the package's greeting string."""
    greeting = "Hello from mentioned!"
    return greeting
src/mentioned/data.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+
4
+ from collections import defaultdict
5
+
6
+ from torch.nn.utils.rnn import pad_sequence
7
+ from torch.utils.data import Dataset, DataLoader
8
+ from datasets import load_dataset
9
+
10
+
11
def mentions_by_sentence(example):
    """Group an example's coreference mentions by the sentence they occur in.

    Adds a "mentions" field mapping sentence index (as a string) to a list of
    (start, end) token offsets, then returns the mutated example.
    """
    grouped = defaultdict(list)
    for chain in example["coref_chains"]:
        for sent_idx, start, end in chain:
            # Arrow datasets require str/bytes dictionary keys.
            grouped[str(sent_idx)].append((start, end))
    example["mentions"] = grouped
    return example
20
+
21
+
22
def flatten_to_sentences(batch):
    """Explode document-level rows into one row per sentence.

    Pairs each sentence with the mentions recorded under its index; sentences
    without an entry get an empty mention list.
    """
    sentences_out = []
    mentions_out = []
    for doc_sentences, doc_mentions in zip(batch["sentences"], batch["mentions"]):
        # Some versions of datasets store an empty dict as None.
        lookup = {} if doc_mentions is None else doc_mentions
        for idx, sentence in enumerate(doc_sentences):
            sentences_out.append(sentence)
            # Missing index -> no mentions in that sentence.
            mentions_out.append(lookup.get(str(idx), []))
    return {"sentence": sentences_out, "mentions": mentions_out}
39
+
40
+
41
class LitBankStringDataset(Dataset):
    """Torch dataset exposing per-sentence tokens with start/span targets."""

    def __init__(self, hf_dataset):
        self.dataset = hf_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        row = self.dataset[idx]
        tokens = row["sentence"]
        # The ArrowDataset yields None instead of an empty list.
        mentions = row["mentions"] if row["mentions"] is not None else []

        n = len(tokens)
        start_targets = torch.zeros(n, dtype=torch.long)
        span_targets = torch.zeros((n, n), dtype=torch.long)

        for start, end in mentions:
            # Drop out-of-range annotations (LitBank `end` is often inclusive).
            if start < n and end < n:
                start_targets[start] = 1
                span_targets[start, end] = 1

        return {
            "tokens": tokens,
            "starts": start_targets,
            "span_labels": span_targets,
        }
69
+
70
+
71
def collate_fn(batch):
    """Pad a list of LitBankStringDataset items into batched tensors.

    Returns the raw sentences plus padded start/span targets together with
    the boolean masks that restrict both losses to real (non-pad) positions.
    """
    sentences = [example["tokens"] for example in batch]
    longest = max(len(tokens) for tokens in sentences)

    start_targets = [example["starts"] for example in batch]
    padded_spans = []
    for example in batch:
        n = len(example["starts"])
        canvas = torch.zeros((longest, longest), dtype=torch.long)
        canvas[:n, :n] = example["span_labels"]
        padded_spans.append(canvas)

    # Pad with -1 so real positions stay recoverable, then zero the padding.
    starts = pad_sequence(start_targets, batch_first=True, padding_value=-1)
    token_mask = starts != -1
    starts[starts == -1] = 0

    # Batched 2D targets for token-pair classification: (B, N, N).
    spans = torch.stack(padded_spans)
    # A pair (i, j) is valid when both ends fall inside the sentence ...
    pair_mask = token_mask.unsqueeze(2) & token_mask.unsqueeze(1)
    # ... j does not precede i ...
    triangular = torch.triu(
        torch.ones((longest, longest), dtype=torch.bool),
        diagonal=0,
    )
    # ... and i is a gold start token: (B, N, 1) broadcast over columns.
    start_rows = starts.unsqueeze(2).bool()
    # "And" all constraints together (like attention masking): (B, N, N).
    span_loss_mask = pair_mask & triangular & start_rows

    return {
        "sentences": sentences,
        "starts": starts,  # (B, N) - Targets for start classifier
        "spans": spans,  # (B, N, N) - Targets for span classifier
        "token_mask": token_mask,  # (B, N) - For 1D loss
        "span_loss_mask": span_loss_mask,  # (B, N, N) - For 2D loss
    }
111
+
112
+
113
def make_litbank() -> tuple[DataLoader, DataLoader, DataLoader]:
    """Reformat LitBank into sentence-level mention-detection DataLoaders.

    Returns:
        (train, validation, test) DataLoaders yielding the padded batches
        produced by `collate_fn`.
    """
    litbank = load_dataset("coref-data/litbank_raw", "split_0")
    litbank_sentences_mentions = litbank.map(mentions_by_sentence).map(
        flatten_to_sentences, batched=True, remove_columns=litbank["train"].column_names
    )
    # Diagnostic: count training sentences with no mentions. Column access
    # avoids the per-row decode of indexing the dataset item by item.
    no = sum(
        1
        for mentions in litbank_sentences_mentions["train"]["mentions"]
        if mentions is None or len(mentions) == 0
    )
    print(f"Training sentences without mentions: {no}.")
    train = LitBankStringDataset(litbank_sentences_mentions["train"])
    val = LitBankStringDataset(litbank_sentences_mentions["validation"])
    test = LitBankStringDataset(litbank_sentences_mentions["test"])
    train_loader = DataLoader(train, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test, batch_size=4, shuffle=False, collate_fn=collate_fn)
    # Sanity check: fail fast here if collation is broken. The previous
    # `try: ... except Exception as e: raise e` was a no-op wrapper.
    next(iter(train_loader))
    return train_loader, val_loader, test_loader
src/mentioned/inference.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from mentioned.model import make_model_v1, LitMentionDetector
5
+
6
+
7
class InferenceMentionDetector(nn.Module):
    """Export-friendly wrapper bundling the encoder and the detector head.

    Applies sigmoid inside `forward` so an exported graph (e.g. ONNX)
    returns final probabilities rather than raw logits.
    """

    def __init__(self, encoder, mention_detector):
        super().__init__()
        self.encoder = encoder
        self.mention_detector = mention_detector

    def forward(self, input_ids, attention_mask, word_ids):
        """
        Inputs (Tensors):
            input_ids: (B, Seq_Len)
            attention_mask: (B, Seq_Len)
            word_ids: (B, Seq_Len) -> Word index per token, -1 for special tokens

        Returns (Tensors):
            start_probs: (B, Num_Words)
            end_probs: (B, Num_Words, Num_Words)
        """
        # Pool subwords into word vectors: (Batch, Num_Words, Hidden_Dim).
        pooled = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            word_ids=word_ids,
        )
        # Score starts (B, W) and start/end pairs (B, W, W).
        start_scores, end_scores = self.mention_detector(pooled)
        # Convert to probabilities so the exported model emits final scores.
        return torch.sigmoid(start_scores), torch.sigmoid(end_scores)
43
+
44
+
45
class MentionProcessor:
    """Turn pre-tokenized documents into the tensor inputs the model expects."""

    def __init__(self, tokenizer, max_length: int = 512):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, docs: list[list[str]]):
        """
        Converts raw word lists into tensors for the ONNX model.
        Args:
            docs: List of documents, where each doc is a list of words.
                  Example: [["Hello", "world"], ["Testing", "this"]]
        """
        # Tokenize; is_split_into_words=True because inputs are word lists.
        encoded = self.tokenizer(
            docs,
            is_split_into_words=True,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length,
            padding=True,
            return_attention_mask=True,
        )

        # Map each subword position to its word index; special/pad tokens
        # (word_id None) become -1 so the pooling layer can ignore them.
        word_rows = [
            torch.tensor(
                [-1 if w is None else w for w in encoded.word_ids(batch_index=i)]
            )
            for i in range(len(docs))
        ]

        return {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
            "word_ids": torch.stack(word_rows),  # (Batch, Seq_Len)
        }
86
+
87
+
88
class MentionDetectorPipeline:
    """Tokenize docs, run the mention model, and decode spans above threshold."""

    def __init__(self, model, tokenizer, threshold: float = 0.5):
        """
        Args:
            model: The InferenceMentionDetector (PyTorch or ONNX Session)
            tokenizer: The PreTrainedTokenizer
            threshold: Probability threshold to consider a mention valid
        """
        self.model = model.eval()
        # Assumes `model.max_length` was attached (create_inference_model does this).
        self.processor = MentionProcessor(tokenizer, model.max_length)
        self.threshold = threshold

    @torch.no_grad()
    def predict(self, docs: list[list[str]]):
        """
        Args:
            docs: List of documents (each is a list of words)
        Returns:
            List of lists containing dicts: {"start": int, "end": int, "score": float}
        """
        # 1. Preprocess to Tensors
        batch = self.processor(docs)
        device = next(self.model.parameters()).device

        # Move batch to model device
        batch = {k: v.to(device) for k, v in batch.items()}

        # 2. Forward Pass
        # start_probs: (B, W), end_probs: (B, W, W)
        start_probs, end_probs = self.model(**batch)

        # 3. Post-process: Extract Mentions
        # NOTE(review): only end_probs is thresholded below; start_probs is
        # computed but unused. Confirm whether the combined P(start)*P(end)
        # score (as in LitMentionDetector.predict_mentions) was intended.
        results = []
        for i in range(len(docs)):
            doc_mentions = []
            doc_len = len(docs[i])

            # We only look at the valid word range for this specific document
            # end_probs[i] is a (W, W) matrix where [s, e] is the prob of span s->e
            valid_spans = (end_probs[i][:doc_len, :doc_len] > self.threshold).nonzero()

            for span in valid_spans:
                start_idx = span[0].item()
                end_idx = span[1].item()

                # Logic: Only valid if end >= start
                if end_idx >= start_idx:
                    score = end_probs[i, start_idx, end_idx].item()
                    doc_mentions.append({
                        "start": start_idx,
                        "end": end_idx,
                        "score": round(score, 4),
                        # `end` index is inclusive, hence the +1 slice.
                        "text": " ".join(docs[i][start_idx : end_idx + 1])
                    })

            results.append(doc_mentions)

        return results
146
+
147
+
148
+ def create_inference_model(repo_id: str, device: str = "cpu"):
149
+ """
150
+ Factory to load a trained model from HF Hub and wrap it for ONNX/Inference.
151
+ """
152
+ # 1. Load the Lightning model (with its weights)
153
+ # Note: Ensure LitMentionDetector is defined in your scope
154
+ fresh_model = make_model_v1()
155
+ lit_model = LitMentionDetector.from_pretrained(
156
+ repo_id,
157
+ tokenizer=fresh_model.tokenizer,
158
+ encoder=fresh_model.encoder,
159
+ mention_detector=fresh_model.mention_detector,
160
+ )
161
+
162
+ # 2. Move to device and set to eval mode
163
+ lit_model.to(device)
164
+ lit_model.eval()
165
+
166
+ # 3. Wrap the core components into the Inference class
167
+ # This separates the 'training' logic from the 'inference' graph
168
+ inference_model = InferenceMentionDetector(
169
+ encoder=lit_model.encoder,
170
+ mention_detector=lit_model.mention_detector
171
+ )
172
+
173
+ # 4. Attach the tokenizer and max_length for the Preprocessor
174
+ # (Optional: helpful for keeping metadata together)
175
+ inference_model.tokenizer = lit_model.tokenizer
176
+ inference_model.max_length = lit_model.encoder.max_length
177
+
178
+ return inference_model.eval()
179
+
180
+
181
# TODO
def compile_inference_model(model):
    """No-op placeholder for future torch.compile / ONNX export of the model."""
    return model
184
+
185
+
186
+ repo_id = "kadarakos/mention-detector-poc-dry-run"
187
+ inference_model = compile_inference_model(
188
+ create_inference_model(repo_id)
189
+ )
190
+ pipeline = MentionDetectorPipeline(
191
+ model=inference_model,
192
+ tokenizer=inference_model.tokenizer,
193
+ threshold=0.6 # Noticed that precision is bad below this (still bad :D).
194
+ )
195
+
196
+ docs = [
197
+ "Does this model actually work?".split(),
198
+ "The name of the mage is Bubba.".split(),
199
+ "It was quite a sunny day when the model finally started working.".split(),
200
+ "Albert Einstein was a theoretical physicist who developed the theory of relativity".split(),
201
+ "Apple Inc. and Microsoft are competing in the cloud computing market".split(),
202
+ "New York City is often called the Big Apple".split(),
203
+ "The Great Barrier Reef is the world's largest coral reef system".split(),
204
+ "Marie Curie was the first woman to win a Nobel Prize".split()
205
+ ]
206
+
207
+ batch_mentions = pipeline.predict(docs)
208
+ for i, mentions in enumerate(batch_mentions):
209
+ print(docs[i])
210
+ for mention in mentions:
211
+ print(mention["text"])
src/mentioned/model.py ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchmetrics
3
+
4
+
5
+ from transformers import AutoTokenizer, AutoModel
6
+ from huggingface_hub import PyTorchModelHubMixin
7
+ from lightning import LightningModule
8
+
9
+
10
class SentenceEncoder(torch.nn.Module):
    """Transformer encoder that mean-pools subword vectors into word vectors."""

    def __init__(
        self,
        model_name: str = "distilroberta-base",
        max_length: int = 512,
    ):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        self.encoder = AutoModel.from_pretrained(model_name)
        self.max_length = max_length
        # Hidden size of the underlying transformer, read from its config.
        self.dim = self.encoder.config.hidden_size
        # NOTE(review): never read or written elsewhere in this file — confirm
        # whether this is still needed.
        self.stats = {}

    def forward(self, input_ids, attention_mask, word_ids):
        """
        Args:
            input_ids: (batch, seq_len)
            attention_mask: (batch, seq_len)
            word_ids: (batch, seq_len) -> Pre-computed word indices,
                      use -1 for special tokens/padding.
        Returns:
            Mean-pooled word embeddings of shape (batch, num_words, hidden_dim).
        """
        # 1. Get Transformer Output
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        subword_embeddings = outputs.last_hidden_state  # (B, S, D)
        # Words are indexed 0..max over the whole batch; -1 entries (special
        # tokens / padding) match no column of the one-hot mask below.
        num_words = word_ids.max() + 1
        word_mask = word_ids.unsqueeze(-1) == torch.arange(
            num_words, device=word_ids.device
        )
        word_mask = word_mask.float()  # (B, S, W)
        # Sum embeddings for each word: (B, W, S) @ (B, S, D) -> (B, W, D)
        word_sums = torch.bmm(word_mask.transpose(1, 2), subword_embeddings)
        # Count subwords per word to get the denominator
        # (B, W, S) summed over S -> (B, W, 1); clamp avoids divide-by-zero
        # for word slots that received no subword.
        subword_counts = word_mask.sum(dim=1).unsqueeze(-1).clamp(min=1e-9)
        # (B, W, D) -- mean over the subwords of each word.
        word_embeddings = word_sums / subword_counts
        return word_embeddings
47
+
48
+
49
class SwiGLU(torch.nn.Module):
    """SwiGLU feed-forward block: w2(silu(w1(x)) * w3(x))."""

    def __init__(self, dim: int, hidden_dim: int = None):
        super().__init__()
        if hidden_dim is None:
            # Common expansion factor of 2.
            hidden_dim = 2 * dim
        self.w1 = torch.nn.Linear(dim, hidden_dim)
        self.w3 = torch.nn.Linear(dim, hidden_dim)
        self.w2 = torch.nn.Linear(hidden_dim, dim)
        self.silu = torch.nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Gate the second projection, then map back to the input dimension.
        return self.w2(self.silu(self.w1(x)) * self.w3(x))
65
+
66
+
67
class Detector(torch.nn.Module):
    """Two-layer MLP scoring head emitting one logit per position or pair."""

    def __init__(self, input_dim: int, hidden_dim: int):
        super().__init__()
        # A 2-layer MLP is the standard choice for span detection so the
        # head can model feature interactions.
        self.net = torch.nn.Sequential(
            torch.nn.Linear(input_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, 1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: (..., input_dim) — e.g. (B, N, input_dim) for start detection
               or (B, N, N, input_dim) for pair detection.
        Returns:
            Logits of shape (..., 1); callers squeeze the trailing dim.
        """
        return self.net(x)
86
+
87
+
88
class MentionDetectorCore(torch.nn.Module):
    """Scores each token as a mention start and each (start, end) token pair."""

    def __init__(
        self,
        start_detector: "Detector",
        end_detector: "Detector",
    ):
        super().__init__()
        self.start_detector = start_detector
        self.end_detector = end_detector

    def forward(self, emb: torch.Tensor):
        """
        Args:
            emb: (Batch, Seq_Len, Hidden_Dim)
        Returns:
            start_logits: (Batch, Seq_Len)
            end_logits: (Batch, Seq_Len, Seq_Len)
        """
        seq_len = emb.shape[1]
        start_logits = self.start_detector(emb).squeeze(-1)
        # Pair representation [emb_i ; emb_j] for every (i, j): (B, N, N, 2H).
        rows = emb.unsqueeze(2).expand(-1, -1, seq_len, -1)
        cols = emb.unsqueeze(1).expand(-1, seq_len, -1, -1)
        pairs = torch.cat([rows, cols], dim=-1)
        end_logits = self.end_detector(pairs).squeeze(-1)
        return start_logits, end_logits
114
+
115
+
116
class LitMentionDetector(LightningModule, PyTorchModelHubMixin):
    """Lightning module training start/span detector heads on a frozen encoder.

    Losses are masked binary cross-entropy over start positions (1D) and
    over (start, end) token pairs (2D). The encoder's parameters are frozen
    in __init__; only the mention-detector heads are optimized.
    """

    def __init__(
        self,
        tokenizer,  #: transformers.PreTrainedTokenizer,
        encoder: torch.nn.Module,
        mention_detector: torch.nn.Module,
        lr: float = 2e-5,
        threshold: float = 0.5,
    ):
        super().__init__()
        # Fixed: the previous ignore list named non-existent parameters
        # ("start_detector"/"end_detector"), so `tokenizer` and
        # `mention_detector` were pickled into hparams and checkpoints.
        # Only lr and threshold belong in hparams; the submodules are
        # re-supplied explicitly on load (see train.py).
        self.save_hyperparameters(ignore=["tokenizer", "encoder", "mention_detector"])
        self.tokenizer = tokenizer
        self.encoder = encoder
        # Freeze all encoder parameters
        for param in self.encoder.parameters():
            param.requires_grad = False
        self.mention_detector = mention_detector

        # reduction="none" so padding can be masked out before averaging.
        self.loss_fn = torch.nn.BCEWithLogitsLoss(reduction="none")

        # Two separate metrics for the two sub-tasks, plus the combined one.
        self.val_f1_start = torchmetrics.classification.BinaryF1Score()
        self.val_f1_end = torchmetrics.classification.BinaryF1Score()
        self.val_f1_mention = torchmetrics.classification.BinaryF1Score()

    def encode(self, docs: list[list[str]]):
        """
        Handles the non-vectorized tokenization and calls the vectorized encoder.

        Args:
            docs: pre-tokenized sentences (lists of words).
        Returns:
            Word-level embeddings of shape (B, Num_Words, Hidden_Dim).
        """
        device = next(self.parameters()).device
        inputs = self.tokenizer(
            docs,
            is_split_into_words=True,
            return_tensors="pt",
            truncation=True,
            max_length=self.encoder.max_length,
            padding=True,
            return_attention_mask=True,
            return_offsets_mapping=True,  # needed for word_ids
        )
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        # Per-token word indices; special/pad tokens (None) become -1 so the
        # encoder's pooling ignores them.
        batch_word_ids = []
        for i in range(len(docs)):
            w_ids = [w if w is not None else -1 for w in inputs.word_ids(batch_index=i)]
            batch_word_ids.append(torch.tensor(w_ids))

        word_ids_tensor = torch.stack(batch_word_ids).to(device)
        word_embeddings = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask, word_ids=word_ids_tensor
        )
        return word_embeddings

    def forward(self, emb: torch.Tensor):
        """Score word embeddings; returns (start_logits, end_logits)."""
        start_logits, end_logits = self.mention_detector(emb)
        return start_logits, end_logits

    def _compute_start_loss(self, start_logits, batch):
        # Mean BCE over real (non-pad) token positions only.
        targets = batch["starts"].float()
        mask = batch["token_mask"].bool()
        return self.loss_fn(start_logits, targets)[mask].mean()

    def _compute_end_loss(self, end_logits, batch):
        # Mean BCE over valid (gold-start, upper-triangular, in-range) pairs.
        targets = batch["spans"].float()
        mask = batch["span_loss_mask"].bool()
        raw_loss = self.loss_fn(end_logits, targets)
        relevant_loss = raw_loss[mask]

        # No valid pairs in the batch: return a zero that keeps the graph
        # connected so backward() still works.
        if relevant_loss.numel() == 0:
            return end_logits.sum() * 0
        return relevant_loss.mean()

    def training_step(self, batch, batch_idx):
        emb = self.encode(batch["sentences"])
        start_logits, end_logits = self.forward(emb)
        loss_start = self._compute_start_loss(start_logits, batch)
        loss_end = self._compute_end_loss(end_logits, batch)
        total_loss = loss_start + loss_end
        self.log_dict(
            {
                "train_loss": total_loss,
                "train_start_loss": loss_start,
                "train_end_loss": loss_end,
            },
            prog_bar=True,
        )

        return total_loss

    def validation_step(self, batch, batch_idx):
        emb = self.encode(batch["sentences"])
        start_logits, end_logits = self.forward(emb)
        token_mask = batch["token_mask"].bool()
        span_loss_mask = batch["span_loss_mask"].bool()

        # --- METRIC 1: Start Detection (Diagnostic) ---
        start_preds = (
            torch.sigmoid(start_logits[token_mask]) > self.hparams.threshold
        ).int()
        start_targets = batch["starts"][token_mask].int()
        if start_targets.numel() > 0:
            self.val_f1_start.update(start_preds, start_targets)

        # --- METRIC 2: End Detection (Diagnostic / Teacher Forced) ---
        # Evaluates end-detector ONLY on ground-truth start positions
        end_preds_diag = (
            torch.sigmoid(end_logits[span_loss_mask]) > self.hparams.threshold
        ).int()
        end_targets_diag = batch["spans"][span_loss_mask].int()
        if end_targets_diag.numel() > 0:
            self.val_f1_end.update(end_preds_diag, end_targets_diag)

        # --- METRIC 3: Full Mention Detection (The "Final Boss") ---
        # A mention is correct only if BOTH start and end are predicted correctly.
        # Combined probability: P(Start) * P(End)
        combined_probs = torch.sigmoid(start_logits).unsqueeze(2) * torch.sigmoid(
            end_logits
        )

        # We evaluate every possible pair in the valid upper triangle of the sentence
        # (Excluding padding and j < i)
        valid_pair_mask = token_mask.unsqueeze(2) & token_mask.unsqueeze(1)
        upper_tri = torch.triu(torch.ones_like(end_logits), diagonal=0).bool()
        mention_eval_mask = valid_pair_mask & upper_tri

        mention_preds = (
            combined_probs[mention_eval_mask] > self.hparams.threshold
        ).int()
        mention_targets = batch["spans"][mention_eval_mask].int()

        if mention_targets.numel() > 0:
            self.val_f1_mention.update(mention_preds, mention_targets)

        # --- 4. Logging ---
        start_loss = self._compute_start_loss(start_logits, batch)
        end_loss = self._compute_end_loss(end_logits, batch)

        self.log_dict(
            {
                "val_loss": start_loss + end_loss,
                "val_f1_start": self.val_f1_start,
                "val_f1_end": self.val_f1_end,
                "val_f1_mention": self.val_f1_mention,
            },
            prog_bar=True,
            batch_size=len(batch["sentences"]),
            on_epoch=True,
        )

    @torch.no_grad()
    def predict_mentions(
        self, sentences: list[list[str]], batch_size: int = 2
    ) -> list[list[tuple[int, int]]]:
        """
        Args:
            sentences: A list of tokenized sentences.
        Returns:
            A list (per sentence) of lists containing (start_idx, end_idx) tuples.
        """
        self.eval()
        all_results = []

        # Process in batches to avoid OOM on large datasets
        for i in range(0, len(sentences), batch_size):
            batch_sentences = sentences[i : i + batch_size]
            # Fixed: was `self.encoder(batch_sentences)`, but the encoder's
            # forward expects (input_ids, attention_mask, word_ids) tensors;
            # encode() performs the tokenization first.
            emb = self.encode(batch_sentences)  # (B, N, H)
            B, N, _ = emb.shape
            start_logits, end_logits = self.forward(emb)

            start_probs = torch.sigmoid(start_logits)  # (B, N)
            end_probs = torch.sigmoid(end_logits)  # (B, N, N)

            # 3. Calculate Joint Confidence
            # (B, N, 1) * (B, N, N) -> (B, N, N)
            combined_probs = start_probs.unsqueeze(2) * end_probs

            # 4. Filter by Constraints (Upper Triangle & Threshold)
            # Create mask for j >= i
            upper_tri = torch.triu(
                torch.ones((N, N), device=self.device), diagonal=0
            ).bool()

            # Apply threshold and upper triangle constraint
            pred_mask = (combined_probs > self.hparams.threshold) & upper_tri

            # 5. Extract Indices
            # nonzero() returns [batch_idx, start_idx, end_idx]
            indices = pred_mask.nonzero()

            # Organize results back into a list of lists (one per sentence in batch)
            batch_results = [[] for _ in range(len(batch_sentences))]
            for b_idx, s_idx, e_idx in indices:
                # Convert to standard Python ints for the final output
                batch_results[b_idx.item()].append((s_idx.item(), e_idx.item()))

            all_results.extend(batch_results)

        return all_results

    def test_step(self, batch, batch_idx):
        # Reuse all the logic from validation_step
        return self.validation_step(batch, batch_idx)

    def configure_optimizers(self):
        # The frozen encoder params have requires_grad=False, so AdamW only
        # updates the detector heads.
        return torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)
321
+
322
+
323
+ def make_model_v1(model_name="distilroberta-base"):
324
+ dim = 768
325
+ tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
326
+ encoder = SentenceEncoder(model_name).train()
327
+ encoder.train()
328
+ start_detector = Detector(dim, dim)
329
+ end_detector = Detector(dim * 2, dim)
330
+ mention_detector = MentionDetectorCore(start_detector, end_detector)
331
+ return LitMentionDetector(tokenizer, encoder, mention_detector)
src/mentioned/py.typed ADDED
File without changes
src/mentioned/train.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import wandb

from lightning import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.loggers import WandbLogger

# Fixed import: the package is "mentioned" (see pyproject.toml and the
# sibling import above) — "mentions.data" does not exist and raised
# ModuleNotFoundError on import.
from mentioned.data import make_litbank
from mentioned.model import make_model_v1, LitMentionDetector
10
+
11
+
12
def train():
    """Train the PoC mention detector on LitBank, push the best checkpoint to
    the HF Hub, then re-download it and verify it on the test set."""
    train_loader, val_loader, test_loader = make_litbank()
    model = make_model_v1()
    wandb_logger = WandbLogger(
        project="mention-detector-poc",
        name="distilroberta-frozen-encoder",
    )
    # Save only the best model for the PoC purposes.
    best_checkpoint = ModelCheckpoint(
        monitor="val_f1_mention",
        mode="max",
        save_top_k=1,
        filename="best-mention-f1",
        verbose=True,
    )
    early_stopper = EarlyStopping(
        monitor="val_f1_mention",
        min_delta=0.01,
        patience=5,
        verbose=True,
        mode="max",
    )
    # Validate every 1000 training steps rather than once per epoch.
    trainer = Trainer(
        val_check_interval=1000,
        check_val_every_n_epoch=None,
        callbacks=[early_stopper, best_checkpoint],
        logger=wandb_logger,
    )
    trainer.fit(
        model=model,
        train_dataloaders=train_loader,
        val_dataloaders=val_loader,
    )
    # Evaluate the best (not the last) checkpoint.
    trainer.test(dataloaders=test_loader, ckpt_path="best", weights_only=False)
    # Rebuild the architecture and load the best weights into it; the
    # submodules are passed explicitly rather than restored from hparams.
    fresh_model = make_model_v1()
    best_model = LitMentionDetector.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path,
        tokenizer=fresh_model.tokenizer,
        encoder=fresh_model.encoder,
        mention_detector=fresh_model.mention_detector,
        weights_only=False,
    )
    best_model.push_to_hub("kadarakos/mention-detector-poc-dry-run", private=True)
    wandb.finish()

    ### Test pull: round-trip through the Hub to confirm the upload loads.
    fresh_model = make_model_v1()
    repo_id = "kadarakos/mention-detector-poc-dry-run"
    remote_model = LitMentionDetector.from_pretrained(
        repo_id,
        tokenizer=fresh_model.tokenizer,
        encoder=fresh_model.encoder,
        mention_detector=fresh_model.mention_detector,
    )

    # 3. Final Verification
    verify_trainer = Trainer(accelerator="auto", logger=False)
    verify_trainer.test(model=remote_model, dataloaders=test_loader)