brocks1234
/

dnabert2-langgraph-handler

Model card Files Files and versions

xet

Community

brocks1234 committed on 28 days ago

Commit

1214747

verified ·

1 Parent(s): 364362c

Update handler.py

Browse files

Files changed (1) hide show

handler.py +27 -20

handler.py CHANGED Viewed

@@ -1,40 +1,45 @@
-import sys
-from unittest.mock import MagicMock
-# 1. MOCK TRITON: This trick prevents the model from even TRYING to load the broken code
-sys.modules["triton"] = MagicMock()
-sys.modules["triton.language"] = MagicMock()
-from typing import Any, Dict, List
-from transformers import AutoTokenizer, AutoModel, AutoConfig
 import torch
 class EndpointHandler:
     def __init__(self, path=""):
         self.model_id = "zhihan1996/DNABERT-2-117M"
-        # 2. FORCE CONFIG: Explicitly disable flash attention in multiple places
-        config = AutoConfig.from_pretrained(self.model_id, trust_remote_code=True)
-        config.use_flash_attn = False
-        if hasattr(config, "auto_map"):
-            # This ensures it doesn't try to use the custom 'Flash' modeling class
-            config.auto_map["AutoModel"] = "modeling_bert.BertModel"
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
-        # 3. LOAD MODEL: Pass the specific config and force trust_remote_code
-        self.model = AutoModel.from_pretrained(
             self.model_id,
             config=config,
-            trust_remote_code=True
         )
         if torch.cuda.is_available():
             self.model = self.model.to("cuda")
     def __call__(self, data: Dict[str, Any]) -> List[float]:
         inputs = data.pop("inputs", data)
-        encoded_input = self.tokenizer(inputs, return_tensors='pt')
         if torch.cuda.is_available():
             encoded_input = {k: v.to("cuda") for k, v in encoded_input.items()}
@@ -42,6 +47,8 @@ class EndpointHandler:
         with torch.no_grad():
             outputs = self.model(**encoded_input)
-        # Mean pooling to get sequence embedding
         embeddings = outputs[0][0].mean(dim=0).cpu().numpy().tolist()
         return embeddings

 import torch
+from typing import Any, Dict, List
+from transformers import AutoTokenizer, BertModel, BertConfig
 class EndpointHandler:
     """Custom endpoint handler that embeds one sequence per request.

     The handler loads the ``zhihan1996/DNABERT-2-117M`` checkpoint into the
     stock ``BertModel`` class and returns the mean of the last hidden state
     over the token axis as a flat list of floats.
     """

     def __init__(self, path: str = "") -> None:
         """Load the tokenizer and model and put the model in eval mode.

         Args:
             path: Accepted for interface compatibility; it is not read here.
                 The checkpoint is always resolved from ``self.model_id``.
         """
         self.model_id = "zhihan1996/DNABERT-2-117M"
         # Resolve the device once so loading and inference always agree.
         self.device = torch.device(
             "cuda" if torch.cuda.is_available() else "cpu"
         )
         # A standard BERT config is used instead of the checkpoint's custom
         # one; the stated intent is to keep 'flash_attn_triton.py' from ever
         # being triggered.
         config = BertConfig.from_pretrained(self.model_id)
         # The tokenizer is still loaded through the checkpoint's own code.
         self.tokenizer = AutoTokenizer.from_pretrained(
             self.model_id, trust_remote_code=True
         )
         # NOTE(review): ignore_mismatched_sizes=True suppresses shape-mismatch
         # errors. If the checkpoint's parameter names or shapes do not match
         # the stock BertModel layout, those weights are freshly initialised
         # rather than loaded -- confirm against the load-time warnings that
         # the embeddings come from pretrained weights.
         self.model = BertModel.from_pretrained(
             self.model_id,
             config=config,
             trust_remote_code=False,
             ignore_mismatched_sizes=True,
         )
         self.model = self.model.to(self.device)
         self.model.eval()

     def __call__(self, data: Dict[str, Any]) -> List[float]:
         """Return the mean-pooled embedding for the request's input.

         Args:
             data: Request payload. The value under ``"inputs"`` is used when
                 present; otherwise the payload itself is passed on. If that
                 value is a list, only its first element is embedded.

         Returns:
             The last hidden state of the first sequence, averaged over the
             token axis, as a list of floats.

         Raises:
             ValueError: If ``"inputs"`` is an empty list.
         """
         # Read without popping so the caller's payload is left untouched.
         inputs = data.get("inputs", data)
         if isinstance(inputs, list):
             if not inputs:
                 raise ValueError("'inputs' must not be an empty list")
             inputs = inputs[0]
         encoded_input = self.tokenizer(
             inputs,
             return_tensors="pt",
             padding=True,
             truncation=True,
             max_length=512,
         )
         encoded_input = {
             key: tensor.to(self.device)
             for key, tensor in encoded_input.items()
         }
         with torch.no_grad():
             outputs = self.model(**encoded_input)
         # outputs[0] is the last hidden state; [0] selects the only sequence.
         last_hidden_state = outputs[0][0]
         return last_hidden_state.mean(dim=0).cpu().tolist()