brocks1234
/

dnabert2-langgraph-handler

Model card Files Files and versions

xet

Community

brocks1234 committed on Apr 21

Commit

8fee4ab

verified ·

1 Parent(s): 40ac285

Update handler.py

Browse files

Files changed (1) hide show

handler.py +13 -24

handler.py CHANGED Viewed

@@ -1,10 +1,6 @@
-import sys
-from unittest.mock import MagicMock
-# 1. GLOBAL BLACKOUT: Must be at the very top, before any other imports
-# This makes Triton invisible to every script the model downloads.
-sys.modules["triton"] = MagicMock()
-sys.modules["triton.language"] = MagicMock()
 import torch
 from typing import Any, Dict, List
@@ -14,29 +10,21 @@ class EndpointHandler:
     def __init__(self, path=""):
         self.model_id = "zhihan1996/DNABERT-2-117M"
-        # 2. Config level: Explicitly set flash_attn to False in the config object
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
-        config = AutoConfig.from_pretrained(self.model_id, trust_remote_code=True)
-        # Some custom implementations check for 'use_flash_attn' or 'flash_attn'
         config.use_flash_attn = False
-        if hasattr(config, "auto_map"):
-            # Force it to use the standard modeling rather than the Triton-based one
-            config.auto_map["AutoModel"] = "modeling_bert.BertModel"
-        # 3. Load Model
         self.model = AutoModel.from_pretrained(
             self.model_id,
             trust_remote_code=True,
-            config=config
         )
-        # 4. Layer Level: Double-check the individual attention layers
-        # This is our last-resort safety net
-        for module in self.model.modules():
-            if hasattr(module, "use_flash_attn"):
-                module.use_flash_attn = False
         if torch.cuda.is_available():
             self.model = self.model.to("cuda")
         self.model.eval()
@@ -58,8 +46,9 @@ class EndpointHandler:
             encoded_input = {k: v.to("cuda") for k, v in encoded_input.items()}
         with torch.no_grad():
-            outputs = self.model(**encoded_input)
-        # Mean pooling
         embeddings = outputs[0][0].mean(dim=0).cpu().numpy().tolist()
         return embeddings

+import os
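+# NOTE(review): TORCH_CUDNN_V8_API_ENABLED only concerns PyTorch's cuDNN v8 API (set before importing torch); it does not by itself select the attention implementation or disable custom Triton/flash kernels - confirm it is still needed.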
+os.environ["TORCH_CUDNN_V8_API_ENABLED"] = "1"
 import torch
 from typing import Any, Dict, List
     def __init__(self, path=""):
         self.model_id = "zhihan1996/DNABERT-2-117M"
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
+        # 1. Load the config and turn off its custom flash-attention flag.
+        # The model load below EXPLICITLY requests attn_implementation='eager', i.e. 'plain PyTorch math' - no Triton, no Flash, just stability.
+        config = AutoConfig.from_pretrained(self.model_id, trust_remote_code=True)
         config.use_flash_attn = False
+        # 2. Load Model with the 'eager' implementation if supported
         self.model = AutoModel.from_pretrained(
             self.model_id,
             trust_remote_code=True,
+            config=config,
+            attn_implementation="eager"
         )
         if torch.cuda.is_available():
             self.model = self.model.to("cuda")
         self.model.eval()
             encoded_input = {k: v.to("cuda") for k, v in encoded_input.items()}
         with torch.no_grad():
+            # 3. inference_mode disables autograd tracking for the forward pass (it is nested inside the no_grad context above)
+            with torch.inference_mode():
+                outputs = self.model(**encoded_input)
         embeddings = outputs[0][0].mean(dim=0).cpu().numpy().tolist()
         return embeddings