End of training

Browse files

Files changed (5) hide show

DisamBertSingleSense.py +17 -13
README.md +17 -37
config.json +2 -0
model.safetensors +2 -2
training_args.bin +1 -1

DisamBertSingleSense.py CHANGED Viewed

@@ -43,21 +43,18 @@ class DisamBertSingleSense(PreTrainedModel):
             self.BaseModel = AutoModel.from_pretrained(config.name_or_path, device_map="auto")
             self.config.vocab_size += 2
             self.BaseModel.resize_token_embeddings(self.config.vocab_size)
-            self.classifier_projection = nn.UninitializedParameter()
             self.classifier_head = nn.UninitializedParameter()
             self.bias = nn.UninitializedParameter()
             self.__entities = None
         else:
             self.BaseModel = ModernBertModel(config)
-            self.classifier_projection = nn.Parameter(
-                torch.empty((256,config.hidden_size)))
             self.classifier_head = nn.Parameter(
-                torch.empty((config.ontology_size, 256))
             )
-            self.bias = nn.Parameter(torch.empty((1,config.ontology_size)))
             self.__entities = pd.Series(config.entities)
         config.init_basemodel = False
-        self.activation = nn.Tanhshrink()
         self.loss = nn.CrossEntropyLoss()
         self.post_init()
@@ -74,6 +71,9 @@ class DisamBertSingleSense(PreTrainedModel):
         vectors = []
         batch = []
         n = 0
         with self.BaseModel.device:
             torch.cuda.empty_cache()
             for entity in entities:
@@ -95,12 +95,10 @@ class DisamBertSingleSense(PreTrainedModel):
             self.__entities = pd.Series(entity_ids)
             self.config.entities = entity_ids
             self.config.ontology_size = len(entity_ids)
-            (U,S,Vh) = torch.linalg.svd(torch.cat(vectors, dim=0),False)
-            self.classifier_head = nn.Parameter(U[:,:256])
-            self.classifier_projection = nn.Parameter(Vh[:256])
             self.bias = nn.Parameter(
                 torch.nn.init.normal_(
-                    torch.empty((1,self.config.ontology_size)),
                     std=self.classifier_head.std().item() * np.sqrt(self.config.hidden_size),
                 )
             )
@@ -128,9 +126,15 @@ class DisamBertSingleSense(PreTrainedModel):
             output_hidden_states=output_hidden_states,
             output_attentions=output_attentions,
         )
-        token_vectors = base_model_output.last_hidden_state[:, 0]
-        projection = self.activation(torch.einsum('ij,kj->ik',token_vectors,self.classifier_projection))
-        logits = torch.einsum("ij,kj->ik", projection, self.classifier_head) + self.bias
         return TokenClassifierOutput(
             logits=logits,

             self.BaseModel = AutoModel.from_pretrained(config.name_or_path, device_map="auto")
             self.config.vocab_size += 2
             self.BaseModel.resize_token_embeddings(self.config.vocab_size)
             self.classifier_head = nn.UninitializedParameter()
             self.bias = nn.UninitializedParameter()
             self.__entities = None
         else:
             self.BaseModel = ModernBertModel(config)
             self.classifier_head = nn.Parameter(
+                torch.empty((config.ontology_size, config.hidden_size))
             )
+            self.bias = nn.Parameter(torch.empty((1, config.ontology_size)))
             self.__entities = pd.Series(config.entities)
         config.init_basemodel = False
         self.loss = nn.CrossEntropyLoss()
         self.post_init()
         vectors = []
         batch = []
         n = 0
+        special_tokens = tokenizer.get_added_vocab()
+        self.config.start_token = special_tokens['[START]']
+        self.config.end_token = special_tokens['[END]']
         with self.BaseModel.device:
             torch.cuda.empty_cache()
             for entity in entities:
             self.__entities = pd.Series(entity_ids)
             self.config.entities = entity_ids
             self.config.ontology_size = len(entity_ids)
+            self.classifier_head = nn.Parameter(torch.cat(vectors, dim=0))
             self.bias = nn.Parameter(
                 torch.nn.init.normal_(
+                    torch.empty((1, self.config.ontology_size)),
                     std=self.classifier_head.std().item() * np.sqrt(self.config.hidden_size),
                 )
             )
             output_hidden_states=output_hidden_states,
             output_attentions=output_attentions,
         )
+        token_vectors = base_model_output.last_hidden_state
+        selection = torch.zeros_like(input_ids,dtype=token_vectors.dtype)
+        starts = (input_ids==self.config.start_token).nonzero()
+        ends = (input_ids==self.config.end_token).nonzero()
+        for (startpos,endpos) in zip(starts,ends,strict=True):
+            selection[startpos[0],startpos[1]:endpos[1]+1]=1.0
+        selection[:,0] = 1.0
+        entity_vectors = torch.einsum('ijk,ij->ik',token_vectors,selection)
+        logits = torch.einsum("ij,kj->ik", entity_vectors, self.classifier_head) + self.bias
         return TokenClassifierOutput(
             logits=logits,

README.md CHANGED Viewed

@@ -22,11 +22,11 @@ should probably proofread and complete it, then remove this comment. -->
 This model is a fine-tuned version of [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base) on the semcor dataset.
 It achieves the following results on the evaluation set:
-- Loss: 4.9159
-- Precision: 0.6058
-- Recall: 0.6152
-- F1: 0.6105
-- Matthews: 0.6150
 ## Model description
@@ -52,43 +52,23 @@ The following hyperparameters were used during training:
 - optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
 - lr_scheduler_type: inverse_sqrt
 - lr_scheduler_warmup_steps: 1000
-- num_epochs: 30
 ### Training results
 | Training Loss | Epoch | Step   | Validation Loss | Precision | Recall | F1     | Matthews |
 |:-------------:|:-----:|:------:|:---------------:|:---------:|:------:|:------:|:--------:|
-| No log        | 0     | 0      | 11.6611         | 0.0       | 0.0    | 0.0    | -0.0000  |
-| 2.5218        | 1.0   | 14014  | 4.1247          | 0.5003    | 0.5245 | 0.5121 | 0.5243   |
-| 1.7184        | 2.0   | 28028  | 3.8822          | 0.5656    | 0.5727 | 0.5692 | 0.5726   |
-| 1.2533        | 3.0   | 42042  | 3.9284          | 0.5859    | 0.5907 | 0.5883 | 0.5905   |
-| 0.9708        | 4.0   | 56056  | 4.0396          | 0.5868    | 0.5907 | 0.5888 | 0.5905   |
-| 0.7932        | 5.0   | 70070  | 4.1447          | 0.5899    | 0.5968 | 0.5934 | 0.5966   |
-| 0.6030        | 6.0   | 84084  | 4.1830          | 0.5932    | 0.6017 | 0.5974 | 0.6014   |
-| 0.5155        | 7.0   | 98098  | 4.2383          | 0.6065    | 0.6082 | 0.6074 | 0.6080   |
-| 0.4701        | 8.0   | 112112 | 4.2015          | 0.6014    | 0.6122 | 0.6068 | 0.6120   |
-| 0.4166        | 9.0   | 126126 | 4.2186          | 0.6096    | 0.6131 | 0.6113 | 0.6128   |
-| 0.3191        | 10.0  | 140140 | 4.3041          | 0.6076    | 0.6096 | 0.6086 | 0.6093   |
-| 0.2979        | 11.0  | 154154 | 4.3275          | 0.6082    | 0.6104 | 0.6093 | 0.6102   |
-| 0.2633        | 12.0  | 168168 | 4.3902          | 0.6171    | 0.6209 | 0.6190 | 0.6207   |
-| 0.2061        | 13.0  | 182182 | 4.4546          | 0.6141    | 0.6196 | 0.6168 | 0.6194   |
-| 0.1829        | 14.0  | 196196 | 4.3960          | 0.6134    | 0.6161 | 0.6147 | 0.6159   |
-| 0.1793        | 15.0  | 210210 | 4.4565          | 0.6151    | 0.6196 | 0.6174 | 0.6194   |
-| 0.1473        | 16.0  | 224224 | 4.4976          | 0.6165    | 0.6218 | 0.6192 | 0.6216   |
-| 0.1631        | 17.0  | 238238 | 4.4916          | 0.6113    | 0.6179 | 0.6146 | 0.6177   |
-| 0.1679        | 18.0  | 252252 | 4.5221          | 0.6114    | 0.6161 | 0.6137 | 0.6159   |
-| 0.1567        | 19.0  | 266266 | 4.5560          | 0.6057    | 0.6166 | 0.6111 | 0.6164   |
-| 0.1670        | 20.0  | 280280 | 4.6266          | 0.6127    | 0.6179 | 0.6153 | 0.6177   |
-| 0.1817        | 21.0  | 294294 | 4.5746          | 0.6117    | 0.6196 | 0.6157 | 0.6194   |
-| 0.1752        | 22.0  | 308308 | 4.6536          | 0.6131    | 0.6192 | 0.6161 | 0.6190   |
-| 0.2083        | 23.0  | 322322 | 4.7661          | 0.6108    | 0.6192 | 0.6150 | 0.6190   |
-| 0.1764        | 24.0  | 336336 | 4.7735          | 0.6105    | 0.6170 | 0.6137 | 0.6168   |
-| 0.2072        | 25.0  | 350350 | 4.8155          | 0.6076    | 0.6157 | 0.6116 | 0.6155   |
-| 0.1668        | 26.0  | 364364 | 4.7572          | 0.6025    | 0.6109 | 0.6067 | 0.6107   |
-| 0.2046        | 27.0  | 378378 | 4.8226          | 0.6028    | 0.6113 | 0.6070 | 0.6111   |
-| 0.2653        | 28.0  | 392392 | 4.8000          | 0.6032    | 0.6166 | 0.6098 | 0.6163   |
-| 0.3166        | 29.0  | 406406 | 4.8968          | 0.6062    | 0.6174 | 0.6118 | 0.6172   |
-| 0.3265        | 30.0  | 420420 | 4.9159          | 0.6058    | 0.6152 | 0.6105 | 0.6150   |
 ### Framework versions

 This model is a fine-tuned version of [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base) on the semcor dataset.
 It achieves the following results on the evaluation set:
+- Loss: 10.0010
+- Precision: 0.6717
+- Recall: 0.6486
+- F1: 0.6599
+- Matthews: 0.6479
 ## Model description
 - optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
 - lr_scheduler_type: inverse_sqrt
 - lr_scheduler_warmup_steps: 1000
+- num_epochs: 10
 ### Training results
 | Training Loss | Epoch | Step   | Validation Loss | Precision | Recall | F1     | Matthews |
 |:-------------:|:-----:|:------:|:---------------:|:---------:|:------:|:------:|:--------:|
+| No log        | 0     | 0      | 641.2748        | 0.0       | 0.0    | 0.0    | -0.0000  |
+| 4.9398        | 1.0   | 14014  | 7.1390          | 0.5863    | 0.5649 | 0.5754 | 0.5641   |
+| 1.9762        | 2.0   | 28028  | 6.1541          | 0.6409    | 0.6117 | 0.6260 | 0.6110   |
+| 1.1673        | 3.0   | 42042  | 6.2676          | 0.6534    | 0.6328 | 0.6429 | 0.6321   |
+| 0.4893        | 4.0   | 56056  | 6.9641          | 0.6609    | 0.6394 | 0.6499 | 0.6387   |
+| 0.2413        | 5.0   | 70070  | 7.8858          | 0.6637    | 0.6363 | 0.6497 | 0.6356   |
+| 0.1245        | 6.0   | 84084  | 8.9750          | 0.6662    | 0.6310 | 0.6481 | 0.6304   |
+| 0.0557        | 7.0   | 98098  | 9.4948          | 0.6693    | 0.6398 | 0.6542 | 0.6391   |
+| 0.0451        | 8.0   | 112112 | 9.7435          | 0.6682    | 0.6402 | 0.6539 | 0.6395   |
+| 0.0359        | 9.0   | 126126 | 9.9980          | 0.6676    | 0.6306 | 0.6486 | 0.6299   |
+| 0.0188        | 10.0  | 140140 | 10.0010         | 0.6717    | 0.6486 | 0.6599 | 0.6479   |
 ### Framework versions

config.json CHANGED Viewed

@@ -17,6 +17,7 @@
   "deterministic_flash_attn": false,
   "dtype": "float32",
   "embedding_dropout": 0.0,
   "entities": [
     "able.a.01",
     "unable.a.01",
@@ -117737,6 +117738,7 @@
   "sep_token_id": 50282,
   "sparse_pred_ignore_index": -100,
   "sparse_prediction": false,
   "tie_word_embeddings": true,
   "transformers_version": "5.2.0",
   "use_cache": false,

   "deterministic_flash_attn": false,
   "dtype": "float32",
   "embedding_dropout": 0.0,
+  "end_token": 50369,
   "entities": [
     "able.a.01",
     "unable.a.01",
   "sep_token_id": 50282,
   "sparse_pred_ignore_index": -100,
   "sparse_prediction": false,
+  "start_token": 50368,
   "tie_word_embeddings": true,
   "transformers_version": "5.2.0",
   "use_cache": false,

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fb2f14abb4480bbe1187b576b4cb231464599407287fe0263c7c64640fb24f65
-size 717817772

 version https://git-lfs.github.com/spec/v1
+oid sha256:8a51dac68b3405593343a667569adf0b33a56734d8818c585b478c96647e8171
+size 957996876

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0cbe6643d4a9a9d097d7d190319dbfe4cdc9057b9380ab52337a02f9c1143eb1
 size 4856

 version https://git-lfs.github.com/spec/v1
+oid sha256:290a6dde229c724e565072da4f33d9559a54b559464d187b49178893aa79cbc3
 size 4856