Spaces:

ZienabM
/

ocr

Sleeping

App Files Files Community

ZienabM committed 13 days ago

Commit

80af787

verified ·

1 Parent(s): 0dc3fe9

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -41

app.py CHANGED Viewed

@@ -37,10 +37,10 @@ async def lifespan(app: FastAPI):
         MODEL_NAME,
         _attn_implementation="eager",
         trust_remote_code=True,
-        torch_dtype=torch.float32,
     )
     model.eval()
-    log.info("Model ready (cpu)")
     yield
     del model, tokenizer
@@ -54,64 +54,78 @@ from contextlib import contextmanager
 @contextmanager
 def force_cpu():
     """
-    DeepSeek-OCR-2's model.infer() hardcodes .cuda() even when no GPU is present.
-    This context manager temporarily replaces all CUDA-moving calls with no-ops
-    so the model runs on CPU without modification.
     """
-    # Save originals
-    _tensor_cuda   = torch.Tensor.cuda
-    _module_cuda   = torch.nn.Module.cuda
-    _tensor_to     = torch.Tensor.to
-    _module_to     = torch.nn.Module.to
-    # Tensor.cuda() → return self (stay on CPU)
     def _noop_tensor_cuda(self, device=None, *args, **kwargs):
         return self
-    # Module.cuda() → return self
     def _noop_module_cuda(self, device=None):
         return self
-    # Tensor.to("cuda") / to(device) → stay on CPU; allow dtype casts
     def _safe_tensor_to(self, *args, **kwargs):
-        filtered = [
-            a for a in args
-            if not (isinstance(a, (str, torch.device)) and "cuda" in str(a))
-        ]
         kwargs.pop("device", None)
-        if filtered or kwargs:
-            try:
-                return _tensor_to(self, *filtered, **kwargs)
-            except Exception:
-                return self
-        return self
-    # Module.to("cuda") → stay on CPU; allow dtype casts
     def _safe_module_to(self, *args, **kwargs):
-        filtered = [
-            a for a in args
-            if not (isinstance(a, (str, torch.device)) and "cuda" in str(a))
-        ]
         kwargs.pop("device", None)
-        if filtered or kwargs:
-            try:
-                return _module_to(self, *filtered, **kwargs)
-            except Exception:
-                return self
         return self
-    torch.Tensor.cuda     = _noop_tensor_cuda
-    torch.nn.Module.cuda  = _noop_module_cuda
-    torch.Tensor.to       = _safe_tensor_to
-    torch.nn.Module.to    = _safe_module_to
     try:
         yield
     finally:
-        torch.Tensor.cuda    = _tensor_cuda
-        torch.nn.Module.cuda = _module_cuda
-        torch.Tensor.to      = _tensor_to
-        torch.nn.Module.to   = _module_to
 # ─── Core OCR inference ───────────────────────────────────────────────────────

         MODEL_NAME,
         _attn_implementation="eager",
         trust_remote_code=True,
+        torch_dtype=torch.bfloat16,
     )
     model.eval()
+    log.info("Model ready (cpu, bfloat16)")
     yield
     del model, tokenizer
 @contextmanager
 def force_cpu():
     """
+    DeepSeek-OCR-2's model.infer() has three CPU-breaking issues:
+      1. Hardcodes .cuda() calls  → patched: .cuda() becomes a no-op
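+      2. Casts tensors to bfloat16 (via .to() or direct .bfloat16() calls)
+         → patched: the model is loaded in bfloat16; .to() drops CUDA device
+           args and the device keyword but keeps dtype casts, and
+           .bfloat16() becomes a no-op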
+      3. Uses torch.autocast("cuda") which can still cast internally
+         → patched: autocast is replaced with a no-op context manager
+    All patches are reverted after the 'with' block.
     """
+    import contextlib
+    _tensor_cuda    = torch.Tensor.cuda
+    _module_cuda    = torch.nn.Module.cuda
+    _tensor_to      = torch.Tensor.to
+    _module_to      = torch.nn.Module.to
+    _tensor_bf16    = torch.Tensor.bfloat16   # model may call .bfloat16() directly
+    _autocast       = torch.autocast
+    # 1. .cuda() → stay on CPU (no-op)
     def _noop_tensor_cuda(self, device=None, *args, **kwargs):
         return self
     def _noop_module_cuda(self, device=None):
         return self
+    # 2a. .to() → strip CUDA device args; keep dtype as-is
+    #     (model is loaded in bfloat16 so dtype is already consistent)
     def _safe_tensor_to(self, *args, **kwargs):
+        new_args = [a for a in args
+                    if not (isinstance(a, (str, torch.device)) and "cuda" in str(a))]
         kwargs.pop("device", None)
+        if not new_args and not kwargs:
+            return self
+        try:
+            return _tensor_to(self, *new_args, **kwargs)
+        except Exception:
+            return self
     def _safe_module_to(self, *args, **kwargs):
+        new_args = [a for a in args
+                    if not (isinstance(a, (str, torch.device)) and "cuda" in str(a))]
         kwargs.pop("device", None)
+        if not new_args and not kwargs:
+            return self
+        try:
+            return _module_to(self, *new_args, **kwargs)
+        except Exception:
+            return self
+    # 2b. .bfloat16() direct calls → no-op (tensor already in bfloat16)
+    def _noop_tensor_bf16(self):
         return self
+    # 3. torch.autocast("cuda", ...) → nullcontext (no-op on CPU)
+    def _noop_autocast(*args, **kwargs):
+        return contextlib.nullcontext()
+    torch.Tensor.cuda    = _noop_tensor_cuda
+    torch.nn.Module.cuda = _noop_module_cuda
+    torch.Tensor.to      = _safe_tensor_to
+    torch.nn.Module.to   = _safe_module_to
+    torch.Tensor.bfloat16 = _noop_tensor_bf16
+    torch.autocast       = _noop_autocast
     try:
         yield
     finally:
+        torch.Tensor.cuda     = _tensor_cuda
+        torch.nn.Module.cuda  = _module_cuda
+        torch.Tensor.to       = _tensor_to
+        torch.nn.Module.to    = _module_to
+        torch.Tensor.bfloat16 = _tensor_bf16
+        torch.autocast        = _autocast
 # ─── Core OCR inference ───────────────────────────────────────────────────────