Update app.py
Browse files
app.py
CHANGED
|
@@ -37,10 +37,10 @@ async def lifespan(app: FastAPI):
|
|
| 37 |
MODEL_NAME,
|
| 38 |
_attn_implementation="eager",
|
| 39 |
trust_remote_code=True,
|
| 40 |
-
torch_dtype=torch.
|
| 41 |
)
|
| 42 |
model.eval()
|
| 43 |
-
log.info("Model ready (cpu)")
|
| 44 |
yield
|
| 45 |
del model, tokenizer
|
| 46 |
|
|
@@ -54,64 +54,78 @@ from contextlib import contextmanager
|
|
| 54 |
@contextmanager
|
| 55 |
def force_cpu():
|
| 56 |
"""
|
| 57 |
-
DeepSeek-OCR-2's model.infer()
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
"""
|
| 61 |
-
|
| 62 |
-
_tensor_cuda = torch.Tensor.cuda
|
| 63 |
-
_module_cuda = torch.nn.Module.cuda
|
| 64 |
-
_tensor_to = torch.Tensor.to
|
| 65 |
-
_module_to = torch.nn.Module.to
|
| 66 |
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
def _noop_tensor_cuda(self, device=None, *args, **kwargs):
|
| 69 |
return self
|
| 70 |
|
| 71 |
-
# Module.cuda() → return self
|
| 72 |
def _noop_module_cuda(self, device=None):
|
| 73 |
return self
|
| 74 |
|
| 75 |
-
#
|
|
|
|
| 76 |
def _safe_tensor_to(self, *args, **kwargs):
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
if not (isinstance(a, (str, torch.device)) and "cuda" in str(a))
|
| 80 |
-
]
|
| 81 |
kwargs.pop("device", None)
|
| 82 |
-
if
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
|
| 89 |
-
# Module.to("cuda") → stay on CPU; allow dtype casts
|
| 90 |
def _safe_module_to(self, *args, **kwargs):
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
if not (isinstance(a, (str, torch.device)) and "cuda" in str(a))
|
| 94 |
-
]
|
| 95 |
kwargs.pop("device", None)
|
| 96 |
-
if
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
return self
|
| 102 |
|
| 103 |
-
torch.
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
try:
|
| 109 |
yield
|
| 110 |
finally:
|
| 111 |
-
torch.Tensor.cuda
|
| 112 |
-
torch.nn.Module.cuda
|
| 113 |
-
torch.Tensor.to
|
| 114 |
-
torch.nn.Module.to
|
|
|
|
|
|
|
| 115 |
|
| 116 |
|
| 117 |
# ─── Core OCR inference ───────────────────────────────────────────────────────
|
|
|
|
| 37 |
MODEL_NAME,
|
| 38 |
_attn_implementation="eager",
|
| 39 |
trust_remote_code=True,
|
| 40 |
+
torch_dtype=torch.bfloat16,
|
| 41 |
)
|
| 42 |
model.eval()
|
| 43 |
+
log.info("Model ready (cpu, bfloat16)")
|
| 44 |
yield
|
| 45 |
del model, tokenizer
|
| 46 |
|
|
|
|
| 54 |
@contextmanager
|
| 55 |
def force_cpu():
|
| 56 |
"""
|
| 57 |
+
DeepSeek-OCR-2's model.infer() has three CPU-breaking issues:
|
| 58 |
+
1. Hardcodes .cuda() calls → patched: .cuda() becomes a no-op
|
| 59 |
+
2. Casts tensors to bfloat16 while model weights are float32
|
| 60 |
+
→ patched: model is loaded in bfloat16 so dtypes match; .bfloat16() is a no-op
|
| 61 |
+
3. Uses torch.autocast("cuda") which can still cast internally
|
| 62 |
+
→ patched: autocast is replaced with a no-op context manager
|
| 63 |
+
All patches are reverted after the 'with' block.
|
| 64 |
"""
|
| 65 |
+
import contextlib
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
+
_tensor_cuda = torch.Tensor.cuda
|
| 68 |
+
_module_cuda = torch.nn.Module.cuda
|
| 69 |
+
_tensor_to = torch.Tensor.to
|
| 70 |
+
_module_to = torch.nn.Module.to
|
| 71 |
+
_tensor_bf16 = torch.Tensor.bfloat16 # model may call .bfloat16() directly
|
| 72 |
+
_autocast = torch.autocast
|
| 73 |
+
|
| 74 |
+
# 1. .cuda() → stay on CPU (no-op)
|
| 75 |
def _noop_tensor_cuda(self, device=None, *args, **kwargs):
|
| 76 |
return self
|
| 77 |
|
|
|
|
| 78 |
def _noop_module_cuda(self, device=None):
|
| 79 |
return self
|
| 80 |
|
| 81 |
+
# 2a. .to() → strip CUDA device args; keep dtype as-is
|
| 82 |
+
# (model is loaded in bfloat16 so dtype is already consistent)
|
| 83 |
def _safe_tensor_to(self, *args, **kwargs):
|
| 84 |
+
new_args = [a for a in args
|
| 85 |
+
if not (isinstance(a, (str, torch.device)) and "cuda" in str(a))]
|
|
|
|
|
|
|
| 86 |
kwargs.pop("device", None)
|
| 87 |
+
if not new_args and not kwargs:
|
| 88 |
+
return self
|
| 89 |
+
try:
|
| 90 |
+
return _tensor_to(self, *new_args, **kwargs)
|
| 91 |
+
except Exception:
|
| 92 |
+
return self
|
| 93 |
|
|
|
|
| 94 |
def _safe_module_to(self, *args, **kwargs):
|
| 95 |
+
new_args = [a for a in args
|
| 96 |
+
if not (isinstance(a, (str, torch.device)) and "cuda" in str(a))]
|
|
|
|
|
|
|
| 97 |
kwargs.pop("device", None)
|
| 98 |
+
if not new_args and not kwargs:
|
| 99 |
+
return self
|
| 100 |
+
try:
|
| 101 |
+
return _module_to(self, *new_args, **kwargs)
|
| 102 |
+
except Exception:
|
| 103 |
+
return self
|
| 104 |
+
|
| 105 |
+
# 2b. .bfloat16() direct calls → no-op (tensor already in bfloat16)
|
| 106 |
+
def _noop_tensor_bf16(self):
|
| 107 |
return self
|
| 108 |
|
| 109 |
+
# 3. torch.autocast("cuda", ...) → nullcontext (no-op on CPU)
|
| 110 |
+
def _noop_autocast(*args, **kwargs):
|
| 111 |
+
return contextlib.nullcontext()
|
| 112 |
+
|
| 113 |
+
torch.Tensor.cuda = _noop_tensor_cuda
|
| 114 |
+
torch.nn.Module.cuda = _noop_module_cuda
|
| 115 |
+
torch.Tensor.to = _safe_tensor_to
|
| 116 |
+
torch.nn.Module.to = _safe_module_to
|
| 117 |
+
torch.Tensor.bfloat16 = _noop_tensor_bf16
|
| 118 |
+
torch.autocast = _noop_autocast
|
| 119 |
|
| 120 |
try:
|
| 121 |
yield
|
| 122 |
finally:
|
| 123 |
+
torch.Tensor.cuda = _tensor_cuda
|
| 124 |
+
torch.nn.Module.cuda = _module_cuda
|
| 125 |
+
torch.Tensor.to = _tensor_to
|
| 126 |
+
torch.nn.Module.to = _module_to
|
| 127 |
+
torch.Tensor.bfloat16 = _tensor_bf16
|
| 128 |
+
torch.autocast = _autocast
|
| 129 |
|
| 130 |
|
| 131 |
# ─── Core OCR inference ───────────────────────────────────────────────────────
|