brocks1234
/

dnabert2-langgraph-handler

Model card Files Files and versions

xet

Community

brocks1234 committed on Apr 21

Commit

364362c

verified ·

1 Parent(s): 773f0e8

Update handler.py

Browse files

Files changed (1) hide show

handler.py +25 -13

handler.py CHANGED Viewed

@@ -1,35 +1,47 @@
 from typing import Any, Dict, List
-from transformers import AutoTokenizer, AutoModel
 import torch
 class EndpointHandler:
     def __init__(self, path=""):
-        # We point directly to the original weights
         self.model_id = "zhihan1996/DNABERT-2-117M"
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
-        # --- FIX: Disable Flash Attention to avoid the Triton error ---
-        from transformers import AutoConfig
         config = AutoConfig.from_pretrained(self.model_id, trust_remote_code=True)
-        config.use_flash_attn = False  # This bypasses the broken line 114 code
-        # --------------------------------------------------------------
-        self.model = AutoModel.from_pretrained(self.model_id, trust_remote_code=True)
         if torch.cuda.is_available():
             self.model = self.model.to("cuda")
     def __call__(self, data: Dict[str, Any]) -> List[float]:
         inputs = data.pop("inputs", data)
-        # DNA Tokenization
         encoded_input = self.tokenizer(inputs, return_tensors='pt')
         if torch.cuda.is_available():
             encoded_input = {k: v.to("cuda") for k, v in encoded_input.items()}
         with torch.no_grad():
             outputs = self.model(**encoded_input)
-        # Returns a 768-dimensional vector representing the DNA sequence
         embeddings = outputs[0][0].mean(dim=0).cpu().numpy().tolist()
         return embeddings

+import sys
+from unittest.mock import MagicMock
+# 1. MOCK TRITON: This trick prevents the model from even TRYING to load the broken code
+sys.modules["triton"] = MagicMock()
+sys.modules["triton.language"] = MagicMock()
 from typing import Any, Dict, List
+from transformers import AutoTokenizer, AutoModel, AutoConfig
 import torch
 class EndpointHandler:
     def __init__(self, path=""):
         self.model_id = "zhihan1996/DNABERT-2-117M"
+        # 2. FORCE CONFIG: Explicitly disable flash attention in multiple places
         config = AutoConfig.from_pretrained(self.model_id, trust_remote_code=True)
+        config.use_flash_attn = False
+        if hasattr(config, "auto_map"):
+            # This ensures it doesn't try to use the custom 'Flash' modeling class
+            config.auto_map["AutoModel"] = "modeling_bert.BertModel"
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
+        # 3. LOAD MODEL: Pass the specific config and force trust_remote_code
+        self.model = AutoModel.from_pretrained(
+            self.model_id,
+            config=config,
+            trust_remote_code=True
+        )
         if torch.cuda.is_available():
             self.model = self.model.to("cuda")
     def __call__(self, data: Dict[str, Any]) -> List[float]:
         inputs = data.pop("inputs", data)
         encoded_input = self.tokenizer(inputs, return_tensors='pt')
         if torch.cuda.is_available():
             encoded_input = {k: v.to("cuda") for k, v in encoded_input.items()}
         with torch.no_grad():
             outputs = self.model(**encoded_input)
+        # Mean pooling to get sequence embedding
         embeddings = outputs[0][0].mean(dim=0).cpu().numpy().tolist()
         return embeddings