Kuangwei Chen committed on
Commit ·
8ee35eb
1
Parent(s): 737e5b1
optimize flash attention import
Browse files- modeling_moss_audio_tokenizer.py +33 -11
modeling_moss_audio_tokenizer.py
CHANGED
|
@@ -16,11 +16,13 @@
|
|
| 16 |
from __future__ import annotations
|
| 17 |
|
| 18 |
import copy
|
|
|
|
| 19 |
import math
|
| 20 |
import sys
|
| 21 |
import types
|
| 22 |
from contextlib import ExitStack, contextmanager
|
| 23 |
from dataclasses import dataclass
|
|
|
|
| 24 |
from pathlib import Path
|
| 25 |
from typing import cast
|
| 26 |
|
|
@@ -91,13 +93,31 @@ except ImportError:
|
|
| 91 |
|
| 92 |
logger = logging.get_logger(__name__)
|
| 93 |
|
| 94 |
-
try:
|
| 95 |
-
from flash_attn import flash_attn_varlen_func
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
|
| 103 |
SUPPORTED_ATTENTION_IMPLEMENTATIONS = {"sdpa", "flash_attention_2"}
|
|
@@ -328,7 +348,7 @@ class MossAudioTokenizerDecodeSession:
|
|
| 328 |
decoder_attention_modules.append(module)
|
| 329 |
|
| 330 |
flash_kvcache_attention_modules: list[MossAudioTokenizerMultiheadAttention] = []
|
| 331 |
-
if use_cuda_graph and
|
| 332 |
for module in decoder_attention_modules:
|
| 333 |
module._use_flash_kvcache = True
|
| 334 |
flash_kvcache_attention_modules.append(module)
|
|
@@ -1237,7 +1257,7 @@ class MossAudioTokenizerMultiheadAttention(StreamingModule):
|
|
| 1237 |
)
|
| 1238 |
|
| 1239 |
def _supports_flash_attention(self, device: torch.device, dtype: torch.dtype) -> bool:
|
| 1240 |
-
return
|
| 1241 |
|
| 1242 |
def _get_backend_check_dtype(self, x: torch.Tensor) -> torch.dtype:
|
| 1243 |
if x.device.type != "cuda":
|
|
@@ -1265,7 +1285,7 @@ class MossAudioTokenizerMultiheadAttention(StreamingModule):
|
|
| 1265 |
"(HAS_FLASH_ATTN=%s).",
|
| 1266 |
x.device,
|
| 1267 |
backend_dtype,
|
| 1268 |
-
|
| 1269 |
)
|
| 1270 |
return "sdpa"
|
| 1271 |
|
|
@@ -1440,6 +1460,7 @@ class MossAudioTokenizerMultiheadAttention(StreamingModule):
|
|
| 1440 |
max_seqlen_q: int,
|
| 1441 |
max_seqlen_k: int,
|
| 1442 |
) -> torch.Tensor:
|
|
|
|
| 1443 |
if flash_attn_varlen_func is None:
|
| 1444 |
raise RuntimeError("flash-attn is not installed.")
|
| 1445 |
window_size = (self.context, 0) if (self.context is not None and self.causal) else (-1, -1)
|
|
@@ -1525,10 +1546,11 @@ class MossAudioTokenizerMultiheadAttention(StreamingModule):
|
|
| 1525 |
return out
|
| 1526 |
|
| 1527 |
def _forward_streaming_flash_kvcache(self, x: torch.Tensor, state: MHAState) -> torch.Tensor:
|
| 1528 |
-
|
| 1529 |
-
|
| 1530 |
if self.context is None:
|
| 1531 |
raise RuntimeError("flash_attn_with_kvcache requires a finite streaming context.")
|
|
|
|
|
|
|
| 1532 |
|
| 1533 |
batch_size, chunk_length, _ = x.shape
|
| 1534 |
q, k_cur, v_cur = self._project_qkv(x)
|
|
|
|
| 16 |
from __future__ import annotations
|
| 17 |
|
| 18 |
import copy
|
| 19 |
+
import importlib
|
| 20 |
import math
|
| 21 |
import sys
|
| 22 |
import types
|
| 23 |
from contextlib import ExitStack, contextmanager
|
| 24 |
from dataclasses import dataclass
|
| 25 |
+
from functools import lru_cache
|
| 26 |
from pathlib import Path
|
| 27 |
from typing import cast
|
| 28 |
|
|
|
|
| 93 |
|
| 94 |
logger = logging.get_logger(__name__)
|
| 95 |
|
|
|
|
|
|
|
| 96 |
|
| 97 |
+
@lru_cache(maxsize=1)
|
| 98 |
+
def _get_flash_attn_module():
|
| 99 |
+
try:
|
| 100 |
+
return importlib.import_module("flash_attn")
|
| 101 |
+
except Exception:
|
| 102 |
+
return None
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def _has_flash_attn() -> bool:
    """Tell whether the optional flash-attn package could be imported."""
    flash_module = _get_flash_attn_module()
    return flash_module is not None
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def _get_flash_attn_varlen_func():
    """Return ``flash_attn.flash_attn_varlen_func`` if available, else ``None``.

    ``None`` is returned both when flash-attn is not installed and when the
    installed version does not expose the attribute.
    """
    module = _get_flash_attn_module()
    if module is None:
        return None
    return getattr(module, "flash_attn_varlen_func", None)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def _get_flash_attn_with_kvcache():
    """Return ``flash_attn.flash_attn_with_kvcache`` if available, else ``None``.

    Mirrors ``_get_flash_attn_varlen_func``: ``None`` when flash-attn is
    missing or the installed build lacks this entry point.
    """
    module = _get_flash_attn_module()
    if module is None:
        return None
    return getattr(module, "flash_attn_with_kvcache", None)
|
| 121 |
|
| 122 |
|
| 123 |
# Attention backends accepted by this model; presumably validated against a
# requested `attn_implementation` elsewhere in the file — confirm at call sites.
SUPPORTED_ATTENTION_IMPLEMENTATIONS = {"sdpa", "flash_attention_2"}
|
|
|
|
| 348 |
decoder_attention_modules.append(module)
|
| 349 |
|
| 350 |
flash_kvcache_attention_modules: list[MossAudioTokenizerMultiheadAttention] = []
|
| 351 |
+
if use_cuda_graph and _has_flash_attn():
|
| 352 |
for module in decoder_attention_modules:
|
| 353 |
module._use_flash_kvcache = True
|
| 354 |
flash_kvcache_attention_modules.append(module)
|
|
|
|
| 1257 |
)
|
| 1258 |
|
| 1259 |
def _supports_flash_attention(self, device: torch.device, dtype: torch.dtype) -> bool:
    """Report whether flash-attention is usable for *device* and *dtype*.

    Requires the flash-attn package to be importable, a CUDA device, and a
    half-precision dtype (fp16 or bf16).
    """
    if not _has_flash_attn():
        return False
    if device.type != "cuda":
        return False
    return dtype in (torch.float16, torch.bfloat16)
|
| 1261 |
|
| 1262 |
def _get_backend_check_dtype(self, x: torch.Tensor) -> torch.dtype:
|
| 1263 |
if x.device.type != "cuda":
|
|
|
|
| 1285 |
"(HAS_FLASH_ATTN=%s).",
|
| 1286 |
x.device,
|
| 1287 |
backend_dtype,
|
| 1288 |
+
_has_flash_attn(),
|
| 1289 |
)
|
| 1290 |
return "sdpa"
|
| 1291 |
|
|
|
|
| 1460 |
max_seqlen_q: int,
|
| 1461 |
max_seqlen_k: int,
|
| 1462 |
) -> torch.Tensor:
|
| 1463 |
+
flash_attn_varlen_func = _get_flash_attn_varlen_func()
|
| 1464 |
if flash_attn_varlen_func is None:
|
| 1465 |
raise RuntimeError("flash-attn is not installed.")
|
| 1466 |
window_size = (self.context, 0) if (self.context is not None and self.causal) else (-1, -1)
|
|
|
|
| 1546 |
return out
|
| 1547 |
|
| 1548 |
def _forward_streaming_flash_kvcache(self, x: torch.Tensor, state: MHAState) -> torch.Tensor:
|
| 1549 |
+
flash_attn_with_kvcache = _get_flash_attn_with_kvcache()
|
|
|
|
| 1550 |
if self.context is None:
|
| 1551 |
raise RuntimeError("flash_attn_with_kvcache requires a finite streaming context.")
|
| 1552 |
+
if flash_attn_with_kvcache is None:
|
| 1553 |
+
raise RuntimeError("flash-attn is not installed.")
|
| 1554 |
|
| 1555 |
batch_size, chunk_length, _ = x.shape
|
| 1556 |
q, k_cur, v_cur = self._project_qkv(x)
|