Upload DisamBert
- DisamBert.py +121 -55
- config.json +1 -0
- model.safetensors +1 -1
DisamBert.py
CHANGED
@@ -1,11 +1,20 @@
 from collections.abc import Generator, Iterable
 from dataclasses import dataclass
 from enum import StrEnum
+from itertools import chain

 import pandas as pd
 import torch
 import torch.nn as nn
-from transformers import …
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoTokenizer,
+    ModernBertModel,
+    PreTrainedConfig,
+    PreTrainedModel,
+)
+from transformers.modeling_outputs import TokenClassifierOutput

 BATCH_SIZE = 64

@@ -36,10 +45,13 @@ class DisamBert(PreTrainedModel):
             self.__entities = None
         else:
             self.BaseModel = ModernBertModel(config)
-            self.classifier_head = nn.Parameter(
-                …
+            self.classifier_head = nn.Parameter(
+                torch.empty((config.vocab_size, config.hidden_size))
+            )
+            self.__entities = pd.Series(config.entities)
         config.init_basemodel = False
         self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_path)
+        self.loss = nn.CrossEntropyLoss()
         self.post_init()

     @classmethod
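Aside on the hunk above: classifier_head is allocated with torch.empty, so its values are undefined until an explicit initialisation step runs — the assert in forward() below refers to an init_classifier method that this diff does not show. A minimal sketch of one plausible shape for that step, assuming it simply fills the parameter in place (the function body and std value are illustrative, not part of this commit):

    import torch
    import torch.nn as nn

    def init_classifier(model) -> None:
        # Hypothetical: fill the uninitialised (vocab_size, hidden_size)
        # entity-scoring matrix in place before the first forward pass.
        with torch.no_grad():
            nn.init.normal_(model.classifier_head, mean=0.0, std=0.02)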
@@ -82,51 +94,58 @@ class DisamBert(PreTrainedModel):
         self.__entities = pd.Series(self.config.entities)
         return self.__entities

-    def forward(…
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: torch.Tensor,
+        lengths: list[list[int]],
+        candidates: list[list[list[int]]],
+        labels: Iterable[list[int]] | None = None,
+        output_hidden_states: bool = False,
+        output_attentions: bool = False,
+    ) -> TokenClassifierOutput:
         assert not nn.parameter.is_lazy(self.classifier_head), (
             "Run init_classifier to initialise weights"
         )
-        …
-            ]
-        )
+        base_model_output = self.BaseModel(
+            input_ids,
+            attention_mask,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+        )
+        token_vectors = base_model_output.last_hidden_state
+        span_vectors = torch.cat(
+            [
+                torch.vstack(
+                    [
+                        torch.sum(chunk, dim=0)
+                        for chunk in self.split(token_vectors[i], sentence_indices)
+                    ]
+                )
+                for (i, sentence_indices) in enumerate(lengths)
+            ]
+        )
+        logits = torch.einsum("ij,kj->ki", span_vectors, self.classifier_head)
+        logits1 = logits - logits.min()
+        mask = torch.zeros_like(logits)
+        for (i, concepts) in enumerate(chain.from_iterable(candidates)):
+            mask[concepts, i] = torch.tensor(1.0)
+        logits2 = logits1 * mask
+        sentence_lengths = [len(sentence_indices) for sentence_indices in lengths]
+        maxlen = max(sentence_lengths)
+        split_logits = torch.split(logits2, sentence_lengths, dim=1)
+        logits3 = torch.stack(
+            [
+                self.extend_to_max_length(sentence, length, maxlen)
+                for (sentence, length) in zip(split_logits, sentence_lengths, strict=True)
+            ]
+        )
+        return TokenClassifierOutput(
+            logits=logits3,
+            loss=self.loss(logits3, labels) if labels is not None else None,
+            hidden_states=base_model_output.hidden_states if output_hidden_states else None,
+            attentions=base_model_output.attentions if output_attentions else None,
+        )

     def split(self, vectors: torch.Tensor, lengths: list[int]) -> tuple[torch.Tensor, ...]:
         maxlen = vectors.shape[0]
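The new forward pass sums token vectors over each span, scores every span against each row of classifier_head, and multiplies by a 0/1 mask so only candidate entities can keep a non-zero score. A minimal standalone sketch of the scoring-and-masking step with toy sizes (every name and number below is illustrative, not from the commit):

    import torch

    hidden, n_entities, n_spans = 4, 6, 3
    span_vectors = torch.randn(n_spans, hidden)       # summed token vectors, one row per span
    classifier_head = torch.randn(n_entities, hidden)

    # "ij,kj->ki" is a matmul with the result transposed: one column per span.
    logits = torch.einsum("ij,kj->ki", span_vectors, classifier_head)
    assert logits.shape == (n_entities, n_spans)
    assert torch.allclose(logits, (span_vectors @ classifier_head.T).T)

    # Shifting by the global minimum makes every score non-negative, so
    # multiplying by the candidate mask zeroes out non-candidate entities.
    candidates = [[0, 2], [1], [3, 4, 5]]             # allowed entity ids per span
    shifted = logits - logits.min()
    mask = torch.zeros_like(logits)
    for i, concepts in enumerate(candidates):
        mask[concepts, i] = 1.0
    masked = shifted * mask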
@@ -135,7 +154,7 @@ class DisamBert(PreTrainedModel):
         chunks = vectors.split((lengths + [maxlen - total_length]) if is_padded else lengths)
         return chunks[:-1] if is_padded else chunks

-    def pad(self, tokens: list[int]) -> PaddedBatch:
+    def pad(self, tokens: Iterable[list[int]]) -> PaddedBatch:
         lengths = [len(sentence) for sentence in tokens]
         maxlen = max(lengths)
         input_ids = torch.tensor(

@@ -152,14 +171,61 @@ class DisamBert(PreTrainedModel):
     def extend_to_max_length(
         self, sentence: torch.Tensor, length: int, maxlength: int
     ) -> torch.Tensor:
-        …
-            ]
-            dim=1,
-        )
-        …
+        with self.BaseModel.device:
+            return (
+                torch.cat(
+                    [
+                        sentence,
+                        torch.zeros((self.__entities.shape[0], maxlength - length)),
+                    ],
+                    dim=1,
+                )
+                if length < maxlength
+                else sentence
+            )
+
+    def pad_labels(self, labels: list[list[int]]) -> torch.Tensor:
+        unk = len(self.config.entities) - 1
+        lengths = [len(seq) for seq in labels]
+        maxlen = max(lengths)
+        with self.BaseModel.device:
+            return torch.tensor(
+                [
+                    seq + [unk] * (maxlen - length)
+                    for (seq, length) in zip(labels, lengths, strict=True)
+                ]
+            )
+
+    def tokenize(
+        self, batch: list[dict[str, str | list[int]]]
+    ) -> dict[str, torch.Tensor | list[list[int]]]:
+        all_indices = []
+        all_tokens = []
+        with self.BaseModel.device:
+            for example in batch:
+                text = example["text"]
+                span_indices = example["indices"]
+                indices = []
+                tokens = []
+                last_span = len(span_indices) - 2
+                for i, position in enumerate(span_indices[:-1]):
+                    span = text[position : span_indices[i + 1]]
+                    span_tokens = self.tokenizer([span], padding=False)["input_ids"][0]
+                    if i > 0:
+                        span_tokens = span_tokens[1:]
+                    if i < last_span:
+                        span_tokens = span_tokens[:-1]
+                    indices.append(len(span_tokens))
+                    tokens.extend(span_tokens)
+                all_indices.append(indices)
+                all_tokens.append(tokens)
+        padded = self.pad(all_tokens)
+        result = {
+            "input_ids": padded.input_ids,
+            "attention_mask": padded.attention_mask,
+            "lengths": all_indices,
+            "candidates": [example["candidates"] for example in batch],
+        }
+        if "labels" in batch[0]:
+            result["labels"] = self.pad_labels([example["labels"] for example in batch])
+        return result
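Taken together, tokenize turns raw examples into model inputs and forward returns per-span entity logits. A hedged end-to-end sketch of how these pieces appear intended to fit together (the dict keys match what tokenize reads; the checkpoint path and entity ids are placeholders):

    import torch

    model = DisamBert.from_pretrained("path/to/checkpoint")  # placeholder path
    model.eval()

    text = "Paris is the capital of France."
    batch = [
        {
            "text": text,
            "indices": [0, 5, len(text)],   # character offsets: spans "Paris" and the rest
            "candidates": [[11, 42], [7]],  # placeholder entity ids, one list per span
        }
    ]

    features = model.tokenize(batch)
    with torch.no_grad():
        output = model(**features)

    # logits: (batch, n_entities, max_spans); pick the best entity per span.
    predictions = output.logits.argmax(dim=1)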
config.json
CHANGED
@@ -117741,5 +117741,6 @@
 "tie_word_embeddings": true,
 "tokenizer_path": "answerdotai/ModernBERT-base",
 "transformers_version": "5.0.0",
+"use_cache": false,
 "vocab_size": 117660
 }
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
+oid sha256:50c403c889a37e9ed106f0912eafe6e97fd2e9bffff26a34d9af7b284643657e
 size 957523088