anubhavg97
/

constbert-onnx

@@ -16,9 +16,12 @@ try:
     model.eval()
     print("✓ Model set to evaluation mode")
-    print("Preparing dummy input for ONNX export...")
-    dummy_input_ids = torch.ones((1, 512), dtype=torch.long)
-    dummy_attention_mask = torch.ones((1, 512), dtype=torch.long)
     print("✓ Dummy input prepared")
     print("Exporting model to ONNX format...")

     model.eval()
     print("✓ Model set to evaluation mode")
+    # Use the doc_maxlen from the *loaded model's* colbert_config
+    actual_doc_maxlen = model.colbert_config.doc_maxlen
+    print(f"DEBUG: model.colbert_config.doc_maxlen = {actual_doc_maxlen}")
+    print(f"Preparing dummy input for ONNX export with doc_maxlen={actual_doc_maxlen}...")
+    dummy_input_ids = torch.ones((1, actual_doc_maxlen), dtype=torch.long)
+    dummy_attention_mask = torch.ones((1, actual_doc_maxlen), dtype=torch.long)
     print("✓ Dummy input prepared")
     print("Exporting model to ONNX format...")

handler.py ADDED Viewed

	@@ -0,0 +1,94 @@

+# handler.py
+import os
+import onnxruntime as ort
+import numpy as np
+from transformers import AutoTokenizer
+from typing import Dict, List, Any
+from colbert_configuration import ColBERTConfig # Import ColBERTConfig
+# modeling.py and colbert_configuration.py are expected to sit in the same directory as this file.
+# The import above is a local import: the handler runs within the model's directory context,
+# so colbert_configuration must be importable from there.
+# If the import fails, consider a custom Docker image, or make the model
+# loadable via AutoModel.from_pretrained by adding an auto_map to config.json.
+# Inference itself goes through ONNX Runtime (model.onnx); this file never calls ConstBERT.from_pretrained.
+# Note: the EndpointHandler class must be named exactly this.
+class EndpointHandler:  # Tokenizes request text and runs it through the ONNX Runtime session built in __init__.
+    def __init__(self, path=""): # path will be '/repository' on HF Endpoints
+        # `path` is the directory that holds the model files (model.onnx, tokenizer files).
+        # Load the tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(path)
+        print(f"Tokenizer loaded from: {path}")
+        # Load ColBERTConfig to get doc_maxlen; __call__ pads and truncates every input to this length.
+        # IMPORTANT: Use load_from_checkpoint to get the *exact* config used for model export.
+        self.colbert_config = ColBERTConfig.load_from_checkpoint(path)
+        self.doc_max_length = self.colbert_config.doc_maxlen
+        print(f"ColBERTConfig doc_maxlen loaded as: {self.doc_max_length}")
+        # Load the ONNX model
+        onnx_model_path = os.path.join(path, "model.onnx")
+        self.session = ort.InferenceSession(onnx_model_path)
+        print(f"ONNX model loaded from: {onnx_model_path}")
+        # Get input names from the ONNX model (only printed below; __call__ does not read self.input_names)
+        self.input_names = [input.name for input in self.session.get_inputs()]  # NOTE(review): `input` shadows the builtin
+        print(f"ONNX input names: {self.input_names}")
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+        Inference call for the endpoint.
+        Args:
+            data (Dict[str, Any]): The request payload.
+                                   Expected to contain "inputs" (str or list of str).
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries, where each dict
+                                  contains the raw multi-vector output for an input.
+                                  Example: [{"embedding": [[...], [...], ...]}, ...]
+        """
+        inputs = data.pop("inputs", None)  # pop() also removes "inputs" from the caller's dict
+        if inputs is None:
+            raise ValueError("No 'inputs' found in the request payload.")
+        # Wrap a single string in a list so the tokenizer always receives a batch
+        if isinstance(inputs, str):
+            inputs = [inputs]
+        # Tokenize the inputs, padding/truncating every sequence to doc_max_length
+        tokenized_inputs = self.tokenizer(
+            inputs,
+            padding="max_length",  # pad each sequence up to max_length
+            truncation=True,
+            max_length=self.doc_max_length, # doc_maxlen read from the ColBERTConfig in __init__
+            return_tensors="np"
+        )
+        input_ids = tokenized_inputs["input_ids"]
+        attention_mask = tokenized_inputs["attention_mask"]
+        # Prepare ONNX input dictionary. NOTE(review): keys are hard-coded; confirm they match self.input_names.
+        onnx_inputs = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask
+        }
+        # Run ONNX inference; None requests every output the model defines
+        outputs = self.session.run(None, onnx_inputs)
+        # The first output is taken to be the multi-vector embedding — TODO confirm against the export script
+        multi_vector_embeddings = outputs[0]
+        # Convert to list of lists (JSON serializable)
+        # A request may carry several inputs; the loop below emits one entry per row of the batch.
+        result_list = []
+        for i in range(multi_vector_embeddings.shape[0]):
+            # Each element in the result_list is a dictionary for one input,
+            # containing its multi-vector embedding (expected 32 x 128 — TODO confirm against the exported model)
+            result_list.append({"embedding": multi_vector_embeddings[i].tolist()})
+        return result_list

model.onnx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:30dae9a99d07f56c103a09173deaa9f76f141976ca20dd8f7e5a5cce8152dee8
-size 436269030

 version https://git-lfs.github.com/spec/v1
+oid sha256:d515b85a59a302d13d04b3a45c6211b3e1893a2718c13598231acc18825f0f02
+size 436300888

modeling.py CHANGED Viewed

@@ -60,6 +60,7 @@ class ConstBERT(BertPreTrainedModel):
         super().__init__(config)
         self.config = config
         self.dim = colbert_config.dim
         self.linear = nn.Linear(config.hidden_size, colbert_config.dim, bias=False)
         self.doc_project = nn.Linear(colbert_config.doc_maxlen, 32, bias=False)
@@ -132,33 +133,46 @@ class ConstBERT(BertPreTrainedModel):
     def forward(self, input_ids, attention_mask):
         """
         Forward method for ONNX export and PyTorch compatibility.
-        This simply calls the existing _query method, preserving all current model behavior.
         """
-        return self._query(input_ids, attention_mask)
     def _doc(self, input_ids, attention_mask, keep_dims=True):
         assert keep_dims in [True, False, 'return_mask']
         input_ids, attention_mask = input_ids.to(self.device), attention_mask.to(self.device)
-        D = self.bert(input_ids, attention_mask=attention_mask)[0]
-        D = D.permute(0, 2, 1) #(64, 128,180)
-        D = self.doc_project(D) #(64, 128,16)
-        D = D.permute(0, 2, 1) #(64,16,128)
-        D = self.linear(D)
-        mask = torch.ones(D.shape[0], D.shape[1], device=self.device).unsqueeze(2).float()
-        # mask = torch.tensor(self.mask(input_ids, skiplist=self.skiplist), device=self.device).unsqueeze(2).float()
-        D = D * mask
         D = torch.nn.functional.normalize(D, p=2, dim=2)
         if self.use_gpu:
             D = D.half()
-        if keep_dims is False:
-            D, mask = D.cpu(), mask.bool().cpu().squeeze(-1)
-            D = [d[mask[idx]] for idx, d in enumerate(D)]
-        elif keep_dims == 'return_mask':
-            return D, mask.bool()
         return D

         super().__init__(config)
         self.config = config
+        self.colbert_config = colbert_config
         self.dim = colbert_config.dim
         self.linear = nn.Linear(config.hidden_size, colbert_config.dim, bias=False)
         self.doc_project = nn.Linear(colbert_config.doc_maxlen, 32, bias=False)
     def forward(self, input_ids, attention_mask):
         """
         Forward method for ONNX export and PyTorch compatibility.
+        This will now call _doc to produce a fixed number of vectors.
         """
+        return self._doc(input_ids, attention_mask)  # keep_dims stays at its default (True)
     def _doc(self, input_ids, attention_mask, keep_dims=True):
         assert keep_dims in [True, False, 'return_mask']
         input_ids, attention_mask = input_ids.to(self.device), attention_mask.to(self.device)
+        D = self.bert(input_ids, attention_mask=attention_mask)[0] # Shape: (batch_size, seq_len, hidden_size)
+        # First, apply linear to project hidden_size to colbert_config.dim (128). linear and doc_project are both bias-free and act on different axes, so their order only affects rounding.
+        D = self.linear(D) # Shape: (batch_size, seq_len, dim)
+        # Now, permute to put seq_len in the feature dimension for doc_project
+        D = D.permute(0, 2, 1) # Shape: (batch_size, dim, seq_len)
+        # Apply doc_project to reduce seq_len (e.g., 250) to fixed length (32)
+        # The nn.Linear(in_features, out_features) operates on the last dimension.
+        # So it expects the last dimension to be seq_len (doc_maxlen).
+        # It will transform it to (batch_size, dim, 32)
+        D = self.doc_project(D) # Shape: (batch_size, dim, 32)
+        # Permute back to (batch_size, 32, dim)
+        D = D.permute(0, 2, 1) # Shape: (batch_size, 32, dim)
+        # Masking is intentionally NOT applied here.
+        # A mask would have to match the (batch_size, 32, dim) output shape.
+        # The previous mask (kept below, commented out) was all ones, so multiplying by it changed nothing.
+        # Given the fixed-size output, whether masking is still needed has to be re-evaluated.
+        # The original mask logic is disabled for now to avoid an immediate conflict.
+        # If a learned mask over the 32 vectors is needed, it requires separate logic.
+        # mask = torch.ones(D.shape[0], D.shape[1], device=self.device).unsqueeze(2).float()
+        # D = D * mask
         D = torch.nn.functional.normalize(D, p=2, dim=2)
         if self.use_gpu:
             D = D.half()
+        # keep_dims is still validated above but no longer changes the result: its conditional branches were removed, so _doc always returns the fixed 32 vectors.
         return D

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+onnxruntime
+transformers
+numpy
+torch # Required by your modeling.py for ConstBERT logic