raul3820 committed on
Commit · 46fac09 · 1 Parent(s): d279f37
Fix head_mask documentation errors in model classes

Added missing head_mask parameter documentation to:
- BertHashModel.forward
- BertHashForMaskedLM.forward
- BertHashForSequenceClassification.forward
This resolves the transformers loading warnings about an undocumented head_mask parameter in the forward docstrings.
Files changed:
- modeling_bert_hash.py +22 -0
- test.py +0 -68
modeling_bert_hash.py
CHANGED
@@ -232,6 +232,14 @@ class BertHashModel(BertPreTrainedModel):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+        r"""
+        head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        """
         output_attentions = (
             output_attentions
             if output_attentions is not None
@@ -432,6 +440,13 @@ class BertHashForMaskedLM(BertPreTrainedModel):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
             config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
             loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+
+        head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
         """
 
         return_dict = (
@@ -553,6 +568,13 @@ class BertHashForSequenceClassification(BertPreTrainedModel):
             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
         """
         return_dict = (
             return_dict if return_dict is not None else self.config.use_return_dict
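For context, a minimal sketch of calling the model with the head_mask argument documented above. It assumes the repository is loaded locally with trust_remote_code (as the removed test.py did) and that the config exposes the standard BERT attributes num_hidden_layers and num_attention_heads; the specific head being zeroed is an arbitrary illustration, not part of this commit.

import torch
from transformers import AutoTokenizer, AutoModel

# Assumed local path containing this repository's model files.
local_model_path = "."
tokenizer = AutoTokenizer.from_pretrained(local_model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(local_model_path, trust_remote_code=True)
model.eval()

# Build a head_mask as described in the docstring above:
# shape (num_layers, num_heads); 1 keeps a head, 0 nullifies it.
num_layers = model.config.num_hidden_layers    # assumed standard BERT config attribute
num_heads = model.config.num_attention_heads   # assumed standard BERT config attribute
head_mask = torch.ones(num_layers, num_heads)
head_mask[0, 0] = 0  # e.g. mask the first head of the first layer

inputs = tokenizer("This is an example sentence", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, head_mask=head_mask)
print(outputs.last_hidden_state.shape)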
test.py
DELETED
@@ -1,68 +0,0 @@
-from transformers import AutoTokenizer, AutoModel
-import torch
-import os
-import sys
-import io
-import tempfile
-import shutil
-
-
-# Mean Pooling - Take attention mask into account for correct averaging
-def meanpooling(output, mask):
-    embeddings = output[
-        0
-    ]  # First element of model_output contains all token embeddings
-    mask = mask.unsqueeze(-1).expand(embeddings.size()).float()
-    return torch.sum(embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
-
-
-# Sentences we want sentence embeddings for
-sentences = ["This is an example sentence", "Each sentence is converted"]
-
-# Load model from local repository (current directory)
-local_model_path = os.getcwd()  # Current directory contains the model files
-
-print(f"Loading model from local path: {local_model_path}")
-# Suppress all output during model loading (including progress bars to stdout and stderr)
-# Save original file descriptors
-orig_stdout = os.dup(1)
-orig_stderr = os.dup(2)
-null_fd = os.open(os.devnull, os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
-# Redirect stdout and stderr to null
-os.dup2(null_fd, 1)
-os.dup2(null_fd, 2)
-try:
-    tokenizer = AutoTokenizer.from_pretrained(local_model_path, trust_remote_code=True)
-    model = AutoModel.from_pretrained(local_model_path, trust_remote_code=True)
-finally:
-    # Restore stdout and stderr
-    os.dup2(orig_stdout, 1)
-    os.dup2(orig_stderr, 2)
-    os.close(null_fd)
-    os.close(orig_stdout)
-    os.close(orig_stderr)
-
-print(f"Model loaded successfully!")
-
-# Set model to evaluation mode
-model.eval()
-
-# Tokenize sentences
-inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
-
-# Add token_type_ids for transformers 5.x compatibility
-if "token_type_ids" not in inputs or inputs["token_type_ids"] is None:
-    batch_size = inputs["input_ids"].size(0)
-    seq_length = inputs["input_ids"].size(1)
-    inputs["token_type_ids"] = torch.zeros(batch_size, seq_length, dtype=torch.long)
-
-# Compute token embeddings
-with torch.no_grad():
-    output = model(**inputs)
-
-# Perform pooling. In this case, mean pooling.
-embeddings = meanpooling(output, inputs["attention_mask"])
-
-print("Sentence embeddings:")
-print(embeddings)
-print(f"\nEmbeddings shape: {embeddings.shape}")