End of training

Browse files

Files changed (5) hide show

DisamBertSingleSense.py +78 -65
README.md +18 -18
config.json +0 -0
model.safetensors +2 -2
training_args.bin +1 -1

DisamBertSingleSense.py CHANGED Viewed

@@ -2,13 +2,14 @@ from collections.abc import Generator, Iterable
 from dataclasses import dataclass
 from enum import StrEnum
-import numpy as np
-import pandas as pd
 import torch
 import torch.nn as nn
 from transformers import (
     AutoConfig,
     AutoModel,
     ModernBertModel,
     PreTrainedConfig,
     PreTrainedModel,
@@ -43,16 +44,8 @@ class DisamBertSingleSense(PreTrainedModel):
             self.BaseModel = AutoModel.from_pretrained(config.name_or_path, device_map="auto")
             self.config.vocab_size += 2
             self.BaseModel.resize_token_embeddings(self.config.vocab_size)
-            self.classifier_head = nn.UninitializedParameter()
-            self.bias = nn.UninitializedParameter()
-            self.__entities = None
         else:
             self.BaseModel = ModernBertModel(config)
-            self.classifier_head = nn.Parameter(
-                torch.empty((config.ontology_size, config.hidden_size))
-            )
-            self.bias = nn.Parameter(torch.empty((1, config.ontology_size)))
-            self.__entities = pd.Series(config.entities)
         config.init_basemodel = False
         self.loss = nn.CrossEntropyLoss()
@@ -64,62 +57,21 @@ class DisamBertSingleSense(PreTrainedModel):
         config.init_basemodel = True
         return cls(config)
-    def init_classifier(
-        self, entities: Generator[LexicalExample], tokenizer: PreTrainedTokenizer
-    ) -> None:
-        entity_ids = []
-        vectors = []
-        batch = []
-        n = 0
-        special_tokens = tokenizer.get_added_vocab()
-        self.config.start_token = special_tokens['[START]']
-        self.config.end_token = special_tokens['[END]']
-        with self.BaseModel.device:
-            torch.cuda.empty_cache()
-            for entity in entities:
-                entity_ids.append(entity.concept)
-                batch.append(entity.definition)
-                n += 1
-                if n == BATCH_SIZE:
-                    tokens = tokenizer(batch, padding=True, return_tensors="pt")
-                    encoding = self.BaseModel(tokens["input_ids"], tokens["attention_mask"])
-                    vectors.append(encoding.last_hidden_state.detach()[:, 0])
-                    n = 0
-                    batch = []
-            if n > 0:
-                tokens = tokenizer(batch, padding=True, return_tensors="pt")
-                encoding = self.BaseModel(tokens["input_ids"], tokens["attention_mask"])
-                vectors.append(encoding.last_hidden_state.detach()[:, 0])
-            self.__entities = pd.Series(entity_ids)
-            self.config.entities = entity_ids
-            self.config.ontology_size = len(entity_ids)
-            self.classifier_head = nn.Parameter(torch.cat(vectors, dim=0))
-            self.bias = nn.Parameter(
-                torch.nn.init.normal_(
-                    torch.empty((1, self.config.ontology_size)),
-                    std=self.classifier_head.std().item() * np.sqrt(self.config.hidden_size),
-                )
-            )
-    @property
-    def entities(self) -> pd.Series:
-        if self.__entities is None and hasattr(self.config, "entities"):
-            self.__entities = pd.Series(self.config.entities)
-        return self.__entities
     def forward(
         self,
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor,
         labels: Iterable[int] | None = None,
         output_hidden_states: bool = False,
         output_attentions: bool = False,
     ) -> TokenClassifierOutput:
-        assert not nn.parameter.is_lazy(self.classifier_head), (
-            "Run init_classifier to initialise weights"
-        )
         base_model_output = self.BaseModel(
             input_ids,
             attention_mask,
@@ -127,14 +79,16 @@ class DisamBertSingleSense(PreTrainedModel):
             output_attentions=output_attentions,
         )
         token_vectors = base_model_output.last_hidden_state
-        selection = torch.zeros_like(input_ids,dtype=token_vectors.dtype)
-        starts = (input_ids==self.config.start_token).nonzero()
-        ends = (input_ids==self.config.end_token).nonzero()
-        for (startpos,endpos) in zip(starts,ends,strict=True):
-            selection[startpos[0],startpos[1]:endpos[1]+1]=1.0
-        selection[:,0] = 1.0
-        entity_vectors = torch.einsum('ijk,ij->ik',token_vectors,selection)
-        logits = torch.einsum("ij,kj->ik", entity_vectors, self.classifier_head) + self.bias
         return TokenClassifierOutput(
             logits=logits,
@@ -142,3 +96,62 @@ class DisamBertSingleSense(PreTrainedModel):
             hidden_states=base_model_output.hidden_states if output_hidden_states else None,
             attentions=base_model_output.attentions if output_attentions else None,
         )

 from dataclasses import dataclass
 from enum import StrEnum
+import pprint
 import torch
 import torch.nn as nn
 from transformers import (
     AutoConfig,
     AutoModel,
+    BatchEncoding,
     ModernBertModel,
     PreTrainedConfig,
     PreTrainedModel,
             self.BaseModel = AutoModel.from_pretrained(config.name_or_path, device_map="auto")
             self.config.vocab_size += 2
             self.BaseModel.resize_token_embeddings(self.config.vocab_size)
         else:
             self.BaseModel = ModernBertModel(config)
         config.init_basemodel = False
         self.loss = nn.CrossEntropyLoss()
         config.init_basemodel = True
         return cls(config)
+    def add_special_tokens(self, start: int, end: int):
+        """Record the token ids that delimit the target span.
+
+        The ids are stored on ``self.config`` as ``start_token`` and
+        ``end_token``; ``forward`` compares ``input_ids`` against them to find
+        the positions between which token vectors are summed.
+
+        Args:
+            start: Token id of the span-opening marker.
+            end: Token id of the span-closing marker.
+        """
+        self.config.start_token = start
+        self.config.end_token = end
     def forward(
         self,
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor,
+        candidate_tokens: torch.Tensor,
+        candidate_attention_masks: torch.Tensor,
+        candidate_mapping: torch.Tensor,
         labels: Iterable[int] | None = None,
         output_hidden_states: bool = False,
         output_attentions: bool = False,
     ) -> TokenClassifierOutput:
         base_model_output = self.BaseModel(
             input_ids,
             attention_mask,
             output_attentions=output_attentions,
         )
         token_vectors = base_model_output.last_hidden_state
+        selection = torch.zeros_like(input_ids, dtype=token_vectors.dtype)
+        starts = (input_ids == self.config.start_token).nonzero()
+        ends = (input_ids == self.config.end_token).nonzero()
+        for startpos, endpos in zip(starts, ends, strict=True):
+            selection[startpos[0], startpos[1] : endpos[1] + 1] = 1.0
+        entity_vectors = torch.einsum("ijk,ij->ik", token_vectors, selection)
+        gloss_vectors = self.gloss_vectors(
+            candidate_tokens, candidate_attention_masks, candidate_mapping
+        )
+        logits = torch.einsum("ij,ikj->ik", entity_vectors, gloss_vectors)
         return TokenClassifierOutput(
             logits=logits,
             hidden_states=base_model_output.hidden_states if output_hidden_states else None,
             attentions=base_model_output.attentions if output_attentions else None,
         )
+    def gloss_vectors(self, candidates, candidate_attention_masks, candidate_mapping):
+        """Encode candidate glosses and group them into one padded row per sentence.
+
+        Args:
+            candidates: Token ids of every candidate gloss in the batch, passed
+                straight to ``self.BaseModel``.
+            candidate_attention_masks: Attention masks matching ``candidates``.
+            candidate_mapping: For each gloss, the index of the sentence it
+                belongs to (assumed 1-D, one entry per row of ``candidates`` -
+                TODO confirm).
+
+        Returns:
+            One zero-padded group of gloss vectors per distinct value in
+            ``candidate_mapping``, stacked into a single tensor; presumably
+            shaped ``(sentences, max_candidates, hidden_size)`` - TODO confirm.
+        """
+        # Entering the device context makes the factory call below (torch.zeros)
+        # allocate on that device; assumes self.device is a torch.device.
+        with self.device:
+            # Position 0 of the last hidden state is used as the gloss embedding.
+            vectors = self.BaseModel(candidates, candidate_attention_masks).last_hidden_state[:, 0]
+            # One chunk per distinct sentence index. torch.unique returns its
+            # values sorted by default, so chunks follow ascending index order.
+            # NOTE(review): a sentence index absent from candidate_mapping gets
+            # no chunk at all, which would shift the later rows - confirm every
+            # sentence always has at least one candidate.
+            chunks = [
+                torch.squeeze(vectors[(candidate_mapping == sentence_index).nonzero()],
+                              dim=1)
+                for sentence_index in torch.unique(candidate_mapping)
+            ]
+            maxlen = max(chunk.shape[0] for chunk in chunks)
+            # Pad every chunk with zero rows up to the largest candidate count so
+            # the chunks can be stacked.
+            # NOTE(review): the padding is created without an explicit dtype, and
+            # a zero row yields a logit of exactly 0 in forward's einsum - confirm
+            # both are intended.
+            return torch.stack(
+                [
+                    torch.cat([chunk, torch.zeros((maxlen - chunk.shape[0], self.config.hidden_size))])
+                    for chunk in chunks
+                ]
+            )
+class CandidateLabeller:
+    """Collate examples together with the tokenized glosses of their candidates.
+
+    The constructor tokenizes every definition in the ontology once; calling
+    the instance on a batch pads the sentences and the glosses of all their
+    candidate concepts, and returns a dict whose keys match the keyword
+    arguments of ``DisamBertSingleSense.forward``.
+    """
+    def __init__(self, tokenizer: PreTrainedTokenizer, ontology: Generator[LexicalExample], device:torch.device):
+        """Store the tokenizer and device and pre-tokenize the ontology.
+
+        Args:
+            tokenizer: Tokenizer used for the glosses and for padding batches.
+            ontology: Examples providing ``concept`` and ``definition``
+                attributes; consumed once here.
+            device: Device context entered while a batch is being built.
+        """
+        self.tokenizer = tokenizer
+        self.device = device
+        # Gloss encodings keyed by concept, so a batch only needs a lookup.
+        self.gloss_tokens = {
+            example.concept: self.tokenizer(example.definition, padding=True)
+            for example in ontology
+        }
+    def __call__(self, batch: dict) -> dict:
+        """Build padded model inputs for a batch of examples.
+
+        Args:
+            batch: Annotated as ``dict`` but indexed and iterated as a sequence
+                of examples; each example is read for ``"input_ids"``,
+                ``"attention_mask"`` and ``"candidates"``, plus ``"label"``
+                when the first example has one.
+
+        Returns:
+            A dict with ``input_ids``, ``attention_mask``, ``candidate_tokens``,
+            ``candidate_attention_masks`` and ``candidate_mapping``, plus
+            ``labels`` when labels are present.
+        """
+        with self.device:
+            # Re-wrap each sentence so tokenizer.pad receives only the two
+            # fields it should pad.
+            encoded = [
+                BatchEncoding(
+                    {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]}
+                )
+                for example in batch
+            ]
+            tokens = self.tokenizer.pad(encoded, padding=True, return_tensors="pt")
+            # Glosses of all candidates, flattened in batch order and padded
+            # together to a common length.
+            candidate_tokens = self.tokenizer.pad(
+                [self.gloss_tokens[concept] for example in batch for concept in example["candidates"]],
+                padding=True,
+                return_attention_mask=True,
+                return_tensors="pt",
+            )
+            result = {
+                "input_ids": tokens.input_ids,
+                "attention_mask": tokens.attention_mask,
+                "candidate_tokens": candidate_tokens.input_ids,
+                "candidate_attention_masks": candidate_tokens.attention_mask,
+                # For every flattened gloss, the position in the batch of the
+                # example it belongs to.
+                "candidate_mapping": torch.cat(
+                    [
+                        torch.tensor([i] * len(example["candidates"]))
+                        for (i, example) in enumerate(batch)
+                    ]
+                ),
+            }
+            # Only the first example is checked for a label.
+            if "label" in batch[0]:
+                # The label becomes the position of the gold concept within the
+                # example's own candidate list.
+                # NOTE(review): presumably candidates is a list, in which case
+                # .index raises ValueError for a label that is not among them.
+                result["labels"] = torch.tensor(
+                    [example["candidates"].index(example["label"]) for example in batch]
+                )
+            return result

README.md CHANGED Viewed

@@ -22,11 +22,11 @@ should probably proofread and complete it, then remove this comment. -->
 This model is a fine-tuned version of [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base) on the semcor dataset.
 It achieves the following results on the evaluation set:
-- Loss: 10.0010
-- Precision: 0.6717
-- Recall: 0.6486
-- F1: 0.6599
-- Matthews: 0.6479
 ## Model description
@@ -46,8 +46,8 @@ More information needed
 The following hyperparameters were used during training:
 - learning_rate: 0.0001
-- train_batch_size: 16
-- eval_batch_size: 16
 - seed: 42
 - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: inverse_sqrt
@@ -58,17 +58,17 @@ The following hyperparameters were used during training:
 | Training Loss | Epoch | Step   | Validation Loss | Precision | Recall | F1     | Matthews |
 |:-------------:|:-----:|:------:|:---------------:|:---------:|:------:|:------:|:--------:|
-| No log        | 0     | 0      | 641.2748        | 0.0       | 0.0    | 0.0    | -0.0000  |
-| 4.9398        | 1.0   | 14014  | 7.1390          | 0.5863    | 0.5649 | 0.5754 | 0.5641   |
-| 1.9762        | 2.0   | 28028  | 6.1541          | 0.6409    | 0.6117 | 0.6260 | 0.6110   |
-| 1.1673        | 3.0   | 42042  | 6.2676          | 0.6534    | 0.6328 | 0.6429 | 0.6321   |
-| 0.4893        | 4.0   | 56056  | 6.9641          | 0.6609    | 0.6394 | 0.6499 | 0.6387   |
-| 0.2413        | 5.0   | 70070  | 7.8858          | 0.6637    | 0.6363 | 0.6497 | 0.6356   |
-| 0.1245        | 6.0   | 84084  | 8.9750          | 0.6662    | 0.6310 | 0.6481 | 0.6304   |
-| 0.0557        | 7.0   | 98098  | 9.4948          | 0.6693    | 0.6398 | 0.6542 | 0.6391   |
-| 0.0451        | 8.0   | 112112 | 9.7435          | 0.6682    | 0.6402 | 0.6539 | 0.6395   |
-| 0.0359        | 9.0   | 126126 | 9.9980          | 0.6676    | 0.6306 | 0.6486 | 0.6299   |
-| 0.0188        | 10.0  | 140140 | 10.0010         | 0.6717    | 0.6486 | 0.6599 | 0.6479   |
 ### Framework versions

 This model is a fine-tuned version of [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base) on the semcor dataset.
 It achieves the following results on the evaluation set:
+- Loss: 7.8247
+- Precision: 0.7569
+- Recall: 0.7432
+- F1: 0.7500
+- Matthews: 0.7427
 ## Model description
 The following hyperparameters were used during training:
 - learning_rate: 0.0001
+- train_batch_size: 8
+- eval_batch_size: 8
 - seed: 42
 - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: inverse_sqrt
 | Training Loss | Epoch | Step   | Validation Loss | Precision | Recall | F1     | Matthews |
 |:-------------:|:-----:|:------:|:---------------:|:---------:|:------:|:------:|:--------:|
+| No log        | 0     | 0      | 81.7936         | 0.4396    | 0.3681 | 0.4007 | 0.3673   |
+| 0.5564        | 1.0   | 28027  | 0.8047          | 0.7521    | 0.7485 | 0.7503 | 0.7480   |
+| 0.4256        | 2.0   | 56054  | 1.0294          | 0.7659    | 0.7590 | 0.7624 | 0.7585   |
+| 0.2639        | 3.0   | 84081  | 1.6682          | 0.7656    | 0.7480 | 0.7567 | 0.7475   |
+| 0.1907        | 4.0   | 112108 | 3.4982          | 0.7703    | 0.7498 | 0.7599 | 0.7493   |
+| 0.0368        | 5.0   | 140135 | 5.1443          | 0.7635    | 0.7458 | 0.7546 | 0.7453   |
+| 0.0382        | 6.0   | 168162 | 6.3556          | 0.7674    | 0.7463 | 0.7567 | 0.7458   |
+| 0.0172        | 7.0   | 196189 | 8.0398          | 0.7548    | 0.7410 | 0.7479 | 0.7405   |
+| 0.0172        | 8.0   | 224216 | 7.1042          | 0.7605    | 0.7467 | 0.7536 | 0.7462   |
+| 0.0113        | 9.0   | 252243 | 7.6688          | 0.7624    | 0.7467 | 0.7545 | 0.7462   |
+| 0.0064        | 10.0  | 280270 | 7.8247          | 0.7569    | 0.7432 | 0.7500 | 0.7427   |
 ### Framework versions

config.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8a51dac68b3405593343a667569adf0b33a56734d8818c585b478c96647e8171
-size 957996876

 version https://git-lfs.github.com/spec/v1
+oid sha256:16fa38968a9a12b8f7abd761f6134a5a79193c9984529af17ec8f2117dfc7050
+size 596077624

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:290a6dde229c724e565072da4f33d9559a54b559464d187b49178893aa79cbc3
 size 4856

 version https://git-lfs.github.com/spec/v1
+oid sha256:79b648b291efd56f0128f34fe729eaf985ba8d68028678fbbb6e87384cb7e662
 size 4856