PeteBleackley committed on
Commit
9f5c6cd
·
verified ·
1 Parent(s): fbbff43

End of training

Browse files
Files changed (5) hide show
  1. DisamBertSingleSense.py +78 -65
  2. README.md +18 -18
  3. config.json +0 -0
  4. model.safetensors +2 -2
  5. training_args.bin +1 -1
DisamBertSingleSense.py CHANGED
@@ -2,13 +2,14 @@ from collections.abc import Generator, Iterable
2
  from dataclasses import dataclass
3
  from enum import StrEnum
4
 
5
- import numpy as np
6
- import pandas as pd
7
  import torch
8
  import torch.nn as nn
9
  from transformers import (
10
  AutoConfig,
11
  AutoModel,
 
12
  ModernBertModel,
13
  PreTrainedConfig,
14
  PreTrainedModel,
@@ -43,16 +44,8 @@ class DisamBertSingleSense(PreTrainedModel):
43
  self.BaseModel = AutoModel.from_pretrained(config.name_or_path, device_map="auto")
44
  self.config.vocab_size += 2
45
  self.BaseModel.resize_token_embeddings(self.config.vocab_size)
46
- self.classifier_head = nn.UninitializedParameter()
47
- self.bias = nn.UninitializedParameter()
48
- self.__entities = None
49
  else:
50
  self.BaseModel = ModernBertModel(config)
51
- self.classifier_head = nn.Parameter(
52
- torch.empty((config.ontology_size, config.hidden_size))
53
- )
54
- self.bias = nn.Parameter(torch.empty((1, config.ontology_size)))
55
- self.__entities = pd.Series(config.entities)
56
  config.init_basemodel = False
57
 
58
  self.loss = nn.CrossEntropyLoss()
@@ -64,62 +57,21 @@ class DisamBertSingleSense(PreTrainedModel):
64
  config.init_basemodel = True
65
  return cls(config)
66
 
67
- def init_classifier(
68
- self, entities: Generator[LexicalExample], tokenizer: PreTrainedTokenizer
69
- ) -> None:
70
- entity_ids = []
71
- vectors = []
72
- batch = []
73
- n = 0
74
- special_tokens = tokenizer.get_added_vocab()
75
- self.config.start_token = special_tokens['[START]']
76
- self.config.end_token = special_tokens['[END]']
77
- with self.BaseModel.device:
78
- torch.cuda.empty_cache()
79
- for entity in entities:
80
- entity_ids.append(entity.concept)
81
- batch.append(entity.definition)
82
-
83
- n += 1
84
- if n == BATCH_SIZE:
85
- tokens = tokenizer(batch, padding=True, return_tensors="pt")
86
- encoding = self.BaseModel(tokens["input_ids"], tokens["attention_mask"])
87
- vectors.append(encoding.last_hidden_state.detach()[:, 0])
88
- n = 0
89
- batch = []
90
- if n > 0:
91
- tokens = tokenizer(batch, padding=True, return_tensors="pt")
92
- encoding = self.BaseModel(tokens["input_ids"], tokens["attention_mask"])
93
- vectors.append(encoding.last_hidden_state.detach()[:, 0])
94
-
95
- self.__entities = pd.Series(entity_ids)
96
- self.config.entities = entity_ids
97
- self.config.ontology_size = len(entity_ids)
98
- self.classifier_head = nn.Parameter(torch.cat(vectors, dim=0))
99
- self.bias = nn.Parameter(
100
- torch.nn.init.normal_(
101
- torch.empty((1, self.config.ontology_size)),
102
- std=self.classifier_head.std().item() * np.sqrt(self.config.hidden_size),
103
- )
104
- )
105
-
106
- @property
107
- def entities(self) -> pd.Series:
108
- if self.__entities is None and hasattr(self.config, "entities"):
109
- self.__entities = pd.Series(self.config.entities)
110
- return self.__entities
111
 
112
  def forward(
113
  self,
114
  input_ids: torch.Tensor,
115
  attention_mask: torch.Tensor,
 
 
 
116
  labels: Iterable[int] | None = None,
117
  output_hidden_states: bool = False,
118
  output_attentions: bool = False,
119
  ) -> TokenClassifierOutput:
120
- assert not nn.parameter.is_lazy(self.classifier_head), (
121
- "Run init_classifier to initialise weights"
122
- )
123
  base_model_output = self.BaseModel(
124
  input_ids,
125
  attention_mask,
@@ -127,14 +79,16 @@ class DisamBertSingleSense(PreTrainedModel):
127
  output_attentions=output_attentions,
128
  )
129
  token_vectors = base_model_output.last_hidden_state
130
- selection = torch.zeros_like(input_ids,dtype=token_vectors.dtype)
131
- starts = (input_ids==self.config.start_token).nonzero()
132
- ends = (input_ids==self.config.end_token).nonzero()
133
- for (startpos,endpos) in zip(starts,ends,strict=True):
134
- selection[startpos[0],startpos[1]:endpos[1]+1]=1.0
135
- selection[:,0] = 1.0
136
- entity_vectors = torch.einsum('ijk,ij->ik',token_vectors,selection)
137
- logits = torch.einsum("ij,kj->ik", entity_vectors, self.classifier_head) + self.bias
 
 
138
 
139
  return TokenClassifierOutput(
140
  logits=logits,
@@ -142,3 +96,62 @@ class DisamBertSingleSense(PreTrainedModel):
142
  hidden_states=base_model_output.hidden_states if output_hidden_states else None,
143
  attentions=base_model_output.attentions if output_attentions else None,
144
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from dataclasses import dataclass
3
  from enum import StrEnum
4
 
5
+ import pprint
6
+
7
  import torch
8
  import torch.nn as nn
9
  from transformers import (
10
  AutoConfig,
11
  AutoModel,
12
+ BatchEncoding,
13
  ModernBertModel,
14
  PreTrainedConfig,
15
  PreTrainedModel,
 
44
  self.BaseModel = AutoModel.from_pretrained(config.name_or_path, device_map="auto")
45
  self.config.vocab_size += 2
46
  self.BaseModel.resize_token_embeddings(self.config.vocab_size)
 
 
 
47
  else:
48
  self.BaseModel = ModernBertModel(config)
 
 
 
 
 
49
  config.init_basemodel = False
50
 
51
  self.loss = nn.CrossEntropyLoss()
 
57
  config.init_basemodel = True
58
  return cls(config)
59
 
60
+ def add_special_tokens(self, start: int, end: int):
61
+ self.config.start_token = start
62
+ self.config.end_token = end
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  def forward(
65
  self,
66
  input_ids: torch.Tensor,
67
  attention_mask: torch.Tensor,
68
+ candidate_tokens: torch.Tensor,
69
+ candidate_attention_masks: torch.Tensor,
70
+ candidate_mapping: torch.Tensor,
71
  labels: Iterable[int] | None = None,
72
  output_hidden_states: bool = False,
73
  output_attentions: bool = False,
74
  ) -> TokenClassifierOutput:
 
 
 
75
  base_model_output = self.BaseModel(
76
  input_ids,
77
  attention_mask,
 
79
  output_attentions=output_attentions,
80
  )
81
  token_vectors = base_model_output.last_hidden_state
82
+ selection = torch.zeros_like(input_ids, dtype=token_vectors.dtype)
83
+ starts = (input_ids == self.config.start_token).nonzero()
84
+ ends = (input_ids == self.config.end_token).nonzero()
85
+ for startpos, endpos in zip(starts, ends, strict=True):
86
+ selection[startpos[0], startpos[1] : endpos[1] + 1] = 1.0
87
+ entity_vectors = torch.einsum("ijk,ij->ik", token_vectors, selection)
88
+ gloss_vectors = self.gloss_vectors(
89
+ candidate_tokens, candidate_attention_masks, candidate_mapping
90
+ )
91
+ logits = torch.einsum("ij,ikj->ik", entity_vectors, gloss_vectors)
92
 
93
  return TokenClassifierOutput(
94
  logits=logits,
 
96
  hidden_states=base_model_output.hidden_states if output_hidden_states else None,
97
  attentions=base_model_output.attentions if output_attentions else None,
98
  )
99
+
100
+ def gloss_vectors(self, candidates, candidate_attention_masks, candidate_mapping):
101
+ with self.device:
102
+ vectors = self.BaseModel(candidates, candidate_attention_masks).last_hidden_state[:, 0]
103
+ chunks = [
104
+ torch.squeeze(vectors[(candidate_mapping == sentence_index).nonzero()],
105
+ dim=1)
106
+ for sentence_index in torch.unique(candidate_mapping)
107
+ ]
108
+ maxlen = max(chunk.shape[0] for chunk in chunks)
109
+ return torch.stack(
110
+ [
111
+ torch.cat([chunk, torch.zeros((maxlen - chunk.shape[0], self.config.hidden_size))])
112
+ for chunk in chunks
113
+ ]
114
+ )
115
+
116
+
117
+ class CandidateLabeller:
118
+ def __init__(self, tokenizer: PreTrainedTokenizer, ontology: Generator[LexicalExample], device:torch.device):
119
+ self.tokenizer = tokenizer
120
+ self.device = device
121
+ self.gloss_tokens = {
122
+ example.concept: self.tokenizer(example.definition, padding=True)
123
+ for example in ontology
124
+ }
125
+
126
+ def __call__(self, batch: dict) -> dict:
127
+ with self.device:
128
+ encoded = [
129
+ BatchEncoding(
130
+ {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]}
131
+ )
132
+ for example in batch
133
+ ]
134
+ tokens = self.tokenizer.pad(encoded, padding=True, return_tensors="pt")
135
+ candidate_tokens = self.tokenizer.pad(
136
+ [self.gloss_tokens[concept] for example in batch for concept in example["candidates"]],
137
+ padding=True,
138
+ return_attention_mask=True,
139
+ return_tensors="pt",
140
+ )
141
+ result = {
142
+ "input_ids": tokens.input_ids,
143
+ "attention_mask": tokens.attention_mask,
144
+ "candidate_tokens": candidate_tokens.input_ids,
145
+ "candidate_attention_masks": candidate_tokens.attention_mask,
146
+ "candidate_mapping": torch.cat(
147
+ [
148
+ torch.tensor([i] * len(example["candidates"]))
149
+ for (i, example) in enumerate(batch)
150
+ ]
151
+ ),
152
+ }
153
+ if "label" in batch[0]:
154
+ result["labels"] = torch.tensor(
155
+ [example["candidates"].index(example["label"]) for example in batch]
156
+ )
157
+ return result
README.md CHANGED
@@ -22,11 +22,11 @@ should probably proofread and complete it, then remove this comment. -->
22
 
23
  This model is a fine-tuned version of [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base) on the semcor dataset.
24
  It achieves the following results on the evaluation set:
25
- - Loss: 10.0010
26
- - Precision: 0.6717
27
- - Recall: 0.6486
28
- - F1: 0.6599
29
- - Matthews: 0.6479
30
 
31
  ## Model description
32
 
@@ -46,8 +46,8 @@ More information needed
46
 
47
  The following hyperparameters were used during training:
48
  - learning_rate: 0.0001
49
- - train_batch_size: 16
50
- - eval_batch_size: 16
51
  - seed: 42
52
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
53
  - lr_scheduler_type: inverse_sqrt
@@ -58,17 +58,17 @@ The following hyperparameters were used during training:
58
 
59
  | Training Loss | Epoch | Step | Validation Loss | Precision | Recall | F1 | Matthews |
60
  |:-------------:|:-----:|:------:|:---------------:|:---------:|:------:|:------:|:--------:|
61
- | No log | 0 | 0 | 641.2748 | 0.0 | 0.0 | 0.0 | -0.0000 |
62
- | 4.9398 | 1.0 | 14014 | 7.1390 | 0.5863 | 0.5649 | 0.5754 | 0.5641 |
63
- | 1.9762 | 2.0 | 28028 | 6.1541 | 0.6409 | 0.6117 | 0.6260 | 0.6110 |
64
- | 1.1673 | 3.0 | 42042 | 6.2676 | 0.6534 | 0.6328 | 0.6429 | 0.6321 |
65
- | 0.4893 | 4.0 | 56056 | 6.9641 | 0.6609 | 0.6394 | 0.6499 | 0.6387 |
66
- | 0.2413 | 5.0 | 70070 | 7.8858 | 0.6637 | 0.6363 | 0.6497 | 0.6356 |
67
- | 0.1245 | 6.0 | 84084 | 8.9750 | 0.6662 | 0.6310 | 0.6481 | 0.6304 |
68
- | 0.0557 | 7.0 | 98098 | 9.4948 | 0.6693 | 0.6398 | 0.6542 | 0.6391 |
69
- | 0.0451 | 8.0 | 112112 | 9.7435 | 0.6682 | 0.6402 | 0.6539 | 0.6395 |
70
- | 0.0359 | 9.0 | 126126 | 9.9980 | 0.6676 | 0.6306 | 0.6486 | 0.6299 |
71
- | 0.0188 | 10.0 | 140140 | 10.0010 | 0.6717 | 0.6486 | 0.6599 | 0.6479 |
72
 
73
 
74
  ### Framework versions
 
22
 
23
  This model is a fine-tuned version of [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base) on the semcor dataset.
24
  It achieves the following results on the evaluation set:
25
+ - Loss: 7.8247
26
+ - Precision: 0.7569
27
+ - Recall: 0.7432
28
+ - F1: 0.7500
29
+ - Matthews: 0.7427
30
 
31
  ## Model description
32
 
 
46
 
47
  The following hyperparameters were used during training:
48
  - learning_rate: 0.0001
49
+ - train_batch_size: 8
50
+ - eval_batch_size: 8
51
  - seed: 42
52
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
53
  - lr_scheduler_type: inverse_sqrt
 
58
 
59
  | Training Loss | Epoch | Step | Validation Loss | Precision | Recall | F1 | Matthews |
60
  |:-------------:|:-----:|:------:|:---------------:|:---------:|:------:|:------:|:--------:|
61
+ | No log | 0 | 0 | 81.7936 | 0.4396 | 0.3681 | 0.4007 | 0.3673 |
62
+ | 0.5564 | 1.0 | 28027 | 0.8047 | 0.7521 | 0.7485 | 0.7503 | 0.7480 |
63
+ | 0.4256 | 2.0 | 56054 | 1.0294 | 0.7659 | 0.7590 | 0.7624 | 0.7585 |
64
+ | 0.2639 | 3.0 | 84081 | 1.6682 | 0.7656 | 0.7480 | 0.7567 | 0.7475 |
65
+ | 0.1907 | 4.0 | 112108 | 3.4982 | 0.7703 | 0.7498 | 0.7599 | 0.7493 |
66
+ | 0.0368 | 5.0 | 140135 | 5.1443 | 0.7635 | 0.7458 | 0.7546 | 0.7453 |
67
+ | 0.0382 | 6.0 | 168162 | 6.3556 | 0.7674 | 0.7463 | 0.7567 | 0.7458 |
68
+ | 0.0172 | 7.0 | 196189 | 8.0398 | 0.7548 | 0.7410 | 0.7479 | 0.7405 |
69
+ | 0.0172 | 8.0 | 224216 | 7.1042 | 0.7605 | 0.7467 | 0.7536 | 0.7462 |
70
+ | 0.0113 | 9.0 | 252243 | 7.6688 | 0.7624 | 0.7467 | 0.7545 | 0.7462 |
71
+ | 0.0064 | 10.0 | 280270 | 7.8247 | 0.7569 | 0.7432 | 0.7500 | 0.7427 |
72
 
73
 
74
  ### Framework versions
config.json CHANGED
The diff for this file is too large to render. See raw diff
 
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a51dac68b3405593343a667569adf0b33a56734d8818c585b478c96647e8171
3
- size 957996876
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16fa38968a9a12b8f7abd761f6134a5a79193c9984529af17ec8f2117dfc7050
3
+ size 596077624
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:290a6dde229c724e565072da4f33d9559a54b559464d187b49178893aa79cbc3
3
  size 4856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79b648b291efd56f0128f34fe729eaf985ba8d68028678fbbb6e87384cb7e662
3
  size 4856