tommulder committed on
Commit
420a04f
·
1 Parent(s): 399c4d1

perf(attn): default to SDPA; gracefully fall back when flash_attn missing; use dtype arg

Browse files
Files changed (1) hide show
  1. src/kybtech_dots_ocr/model_loader.py +26 -11
src/kybtech_dots_ocr/model_loader.py CHANGED
@@ -23,7 +23,7 @@ REPO_ID = os.getenv("DOTS_OCR_REPO_ID", "rednote-hilab/dots.ocr")
23
  LOCAL_DIR = os.getenv("DOTS_OCR_LOCAL_DIR", "/data/models/dots-ocr")
24
  DEVICE_CONFIG = os.getenv("DOTS_OCR_DEVICE", "auto")
25
  MAX_NEW_TOKENS = int(os.getenv("DOTS_OCR_MAX_NEW_TOKENS", "2048"))
26
- USE_FLASH_ATTENTION = os.getenv("DOTS_OCR_FLASH_ATTENTION", "1") == "1"
27
  MIN_PIXELS = int(os.getenv("DOTS_OCR_MIN_PIXELS", "3136")) # 56x56
28
  MAX_PIXELS = int(os.getenv("DOTS_OCR_MAX_PIXELS", "11289600")) # 3360x3360
29
  CUSTOM_PROMPT = os.getenv("DOTS_OCR_PROMPT")
@@ -91,6 +91,22 @@ class DotsOCRModelLoader:
91
  except Exception as e:
92
  logger.error(f"Failed to download model: {e}")
93
  raise RuntimeError(f"Model download failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  def load_model(self) -> None:
96
  """Load the Dots.OCR model and processor."""
@@ -110,21 +126,20 @@ class DotsOCRModelLoader:
110
 
111
  # Load model with appropriate configuration
112
  model_kwargs = {
113
- "torch_dtype": self.dtype,
114
  "trust_remote_code": True,
115
  }
116
 
117
  # Add device-specific configurations
118
  if self.device == "cuda":
119
- # Use flash attention if available and requested
120
- if USE_FLASH_ATTENTION:
121
- try:
122
- model_kwargs["attn_implementation"] = "flash_attention_2"
123
- logger.info("Using flash attention 2")
124
- except Exception as e:
125
- logger.warning(f"Flash attention not available: {e}")
126
- logger.info("Falling back to standard attention")
127
-
128
  # Use device_map for automatic GPU memory management
129
  model_kwargs["device_map"] = "auto"
130
  else:
 
23
  LOCAL_DIR = os.getenv("DOTS_OCR_LOCAL_DIR", "/data/models/dots-ocr")
24
  DEVICE_CONFIG = os.getenv("DOTS_OCR_DEVICE", "auto")
25
  MAX_NEW_TOKENS = int(os.getenv("DOTS_OCR_MAX_NEW_TOKENS", "2048"))
26
+ USE_FLASH_ATTENTION = os.getenv("DOTS_OCR_FLASH_ATTENTION", "0") == "1"
27
  MIN_PIXELS = int(os.getenv("DOTS_OCR_MIN_PIXELS", "3136")) # 56x56
28
  MAX_PIXELS = int(os.getenv("DOTS_OCR_MAX_PIXELS", "11289600")) # 3360x3360
29
  CUSTOM_PROMPT = os.getenv("DOTS_OCR_PROMPT")
 
91
  except Exception as e:
92
  logger.error(f"Failed to download model: {e}")
93
  raise RuntimeError(f"Model download failed: {e}")
94
+
95
+ def _can_use_flash_attn(self) -> bool:
96
+ """Check whether FlashAttention2 can be enabled safely.
97
+
98
+ Returns True only if the package is importable and dtype is fp16/bf16.
99
+ """
100
+ if not USE_FLASH_ATTENTION:
101
+ return False
102
+ try:
103
+ # Import check avoids runtime error from Transformers if not installed
104
+ import flash_attn # type: ignore # noqa: F401
105
+ except Exception:
106
+ logger.warning("flash_attn package not installed; disabling FlashAttention2")
107
+ return False
108
+ # FlashAttention2 supports fp16/bf16 only (see HF docs)
109
+ return self.dtype in (torch.float16, torch.bfloat16)
110
 
111
  def load_model(self) -> None:
112
  """Load the Dots.OCR model and processor."""
 
126
 
127
  # Load model with appropriate configuration
128
  model_kwargs = {
129
+ "dtype": self.dtype, # torch_dtype is deprecated
130
  "trust_remote_code": True,
131
  }
132
 
133
  # Add device-specific configurations
134
  if self.device == "cuda":
135
+ # Prefer FlashAttention2 when truly available; otherwise SDPA
136
+ if self._can_use_flash_attn():
137
+ model_kwargs["attn_implementation"] = "flash_attention_2"
138
+ logger.info("Using flash attention 2")
139
+ else:
140
+ model_kwargs["attn_implementation"] = "sdpa"
141
+ logger.info("Using SDPA attention (flash-attn unavailable or disabled)")
142
+
 
143
  # Use device_map for automatic GPU memory management
144
  model_kwargs["device_map"] = "auto"
145
  else: