Update app.py
Browse files
app.py
CHANGED
|
@@ -38,7 +38,6 @@ async def lifespan(app: FastAPI):
|
|
| 38 |
_attn_implementation="eager",
|
| 39 |
trust_remote_code=True,
|
| 40 |
torch_dtype=torch.float32,
|
| 41 |
-
use_safetensors=True,
|
| 42 |
)
|
| 43 |
model.eval()
|
| 44 |
log.info("Model ready (cpu)")
|
|
@@ -55,24 +54,25 @@ from contextlib import contextmanager
|
|
| 55 |
@contextmanager
|
| 56 |
def force_cpu():
|
| 57 |
"""
|
| 58 |
-
DeepSeek-OCR-2's model.infer() hardcodes .cuda()
|
| 59 |
-
This context manager
|
| 60 |
-
|
| 61 |
-
2. Completely DISABLES torch.autocast so bfloat16 is never applied
|
| 62 |
"""
|
| 63 |
# Save originals
|
| 64 |
_tensor_cuda = torch.Tensor.cuda
|
| 65 |
_module_cuda = torch.nn.Module.cuda
|
| 66 |
_tensor_to = torch.Tensor.to
|
| 67 |
_module_to = torch.nn.Module.to
|
| 68 |
-
_autocast = torch.autocast
|
| 69 |
|
|
|
|
| 70 |
def _noop_tensor_cuda(self, device=None, *args, **kwargs):
|
| 71 |
return self
|
| 72 |
|
|
|
|
| 73 |
def _noop_module_cuda(self, device=None):
|
| 74 |
return self
|
| 75 |
|
|
|
|
| 76 |
def _safe_tensor_to(self, *args, **kwargs):
|
| 77 |
filtered = [
|
| 78 |
a for a in args
|
|
@@ -86,6 +86,7 @@ def force_cpu():
|
|
| 86 |
return self
|
| 87 |
return self
|
| 88 |
|
|
|
|
| 89 |
def _safe_module_to(self, *args, **kwargs):
|
| 90 |
filtered = [
|
| 91 |
a for a in args
|
|
@@ -99,38 +100,25 @@ def force_cpu():
|
|
| 99 |
return self
|
| 100 |
return self
|
| 101 |
|
| 102 |
-
# تعطيل autocast بالكامل — لا نريده يحوّل أي شيء إلى bfloat16
|
| 103 |
-
@contextmanager
|
| 104 |
-
def _disabled_autocast(device_type=None, dtype=None, enabled=True, cache_enabled=None):
|
| 105 |
-
"""
|
| 106 |
-
Completely disables autocast.
|
| 107 |
-
The model code calls: with torch.autocast("cuda", dtype=torch.bfloat16):
|
| 108 |
-
We replace it with a no-op context manager that does nothing.
|
| 109 |
-
"""
|
| 110 |
-
yield # لا تفعل شيئاً — لا تحويل أنواع
|
| 111 |
-
|
| 112 |
-
# Apply patches
|
| 113 |
torch.Tensor.cuda = _noop_tensor_cuda
|
| 114 |
torch.nn.Module.cuda = _noop_module_cuda
|
| 115 |
torch.Tensor.to = _safe_tensor_to
|
| 116 |
torch.nn.Module.to = _safe_module_to
|
| 117 |
-
torch.autocast = _disabled_autocast
|
| 118 |
|
| 119 |
try:
|
| 120 |
yield
|
| 121 |
finally:
|
| 122 |
-
# Restore originals
|
| 123 |
torch.Tensor.cuda = _tensor_cuda
|
| 124 |
torch.nn.Module.cuda = _module_cuda
|
| 125 |
torch.Tensor.to = _tensor_to
|
| 126 |
torch.nn.Module.to = _module_to
|
| 127 |
-
|
| 128 |
|
| 129 |
# ─── Core OCR inference ───────────────────────────────────────────────────────
|
| 130 |
def run_ocr(pil_image: Image.Image, mode: str = "free") -> str:
|
| 131 |
"""
|
| 132 |
Run DeepSeek-OCR-2 on a PIL image and return extracted text.
|
| 133 |
-
|
| 134 |
"""
|
| 135 |
prompt_text = (
|
| 136 |
"Convert the document to markdown."
|
|
@@ -144,9 +132,8 @@ def run_ocr(pil_image: Image.Image, mode: str = "free") -> str:
|
|
| 144 |
|
| 145 |
try:
|
| 146 |
if hasattr(model, "infer"):
|
| 147 |
-
# ⚠️ إجبار النموذج بالكامل على float32 قبل الاستخدام
|
| 148 |
-
model.to(torch.float32)
|
| 149 |
with tempfile.TemporaryDirectory() as out_dir:
|
|
|
|
| 150 |
with force_cpu():
|
| 151 |
result = model.infer(
|
| 152 |
tokenizer,
|
|
@@ -162,7 +149,7 @@ def run_ocr(pil_image: Image.Image, mode: str = "free") -> str:
|
|
| 162 |
return result.get("text", str(result))
|
| 163 |
return str(result) if result else ""
|
| 164 |
|
| 165 |
-
# Fallback
|
| 166 |
messages = [{"role": "user", "content": [
|
| 167 |
{"type": "image", "image": tmp_path},
|
| 168 |
{"type": "text", "text": prompt_text},
|
|
|
|
| 38 |
_attn_implementation="eager",
|
| 39 |
trust_remote_code=True,
|
| 40 |
torch_dtype=torch.float32,
|
|
|
|
| 41 |
)
|
| 42 |
model.eval()
|
| 43 |
log.info("Model ready (cpu)")
|
|
|
|
| 54 |
@contextmanager
|
| 55 |
def force_cpu():
|
| 56 |
"""
|
| 57 |
+
DeepSeek-OCR-2's model.infer() hardcodes .cuda() even when no GPU is present.
|
| 58 |
+
This context manager temporarily replaces all CUDA-moving calls with no-ops
|
| 59 |
+
so the model runs on CPU without modification.
|
|
|
|
| 60 |
"""
|
| 61 |
# Save originals
|
| 62 |
_tensor_cuda = torch.Tensor.cuda
|
| 63 |
_module_cuda = torch.nn.Module.cuda
|
| 64 |
_tensor_to = torch.Tensor.to
|
| 65 |
_module_to = torch.nn.Module.to
|
|
|
|
| 66 |
|
| 67 |
+
# Tensor.cuda() → return self (stay on CPU)
|
| 68 |
def _noop_tensor_cuda(self, device=None, *args, **kwargs):
|
| 69 |
return self
|
| 70 |
|
| 71 |
+
# Module.cuda() → return self
|
| 72 |
def _noop_module_cuda(self, device=None):
|
| 73 |
return self
|
| 74 |
|
| 75 |
+
# Tensor.to("cuda") / to(device) → stay on CPU; allow dtype casts
|
| 76 |
def _safe_tensor_to(self, *args, **kwargs):
|
| 77 |
filtered = [
|
| 78 |
a for a in args
|
|
|
|
| 86 |
return self
|
| 87 |
return self
|
| 88 |
|
| 89 |
+
# Module.to("cuda") → stay on CPU; allow dtype casts
|
| 90 |
def _safe_module_to(self, *args, **kwargs):
|
| 91 |
filtered = [
|
| 92 |
a for a in args
|
|
|
|
| 100 |
return self
|
| 101 |
return self
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
torch.Tensor.cuda = _noop_tensor_cuda
|
| 104 |
torch.nn.Module.cuda = _noop_module_cuda
|
| 105 |
torch.Tensor.to = _safe_tensor_to
|
| 106 |
torch.nn.Module.to = _safe_module_to
|
|
|
|
| 107 |
|
| 108 |
try:
|
| 109 |
yield
|
| 110 |
finally:
|
|
|
|
| 111 |
torch.Tensor.cuda = _tensor_cuda
|
| 112 |
torch.nn.Module.cuda = _module_cuda
|
| 113 |
torch.Tensor.to = _tensor_to
|
| 114 |
torch.nn.Module.to = _module_to
|
| 115 |
+
|
| 116 |
|
| 117 |
# ─── Core OCR inference ───────────────────────────────────────────────────────
|
| 118 |
def run_ocr(pil_image: Image.Image, mode: str = "free") -> str:
|
| 119 |
"""
|
| 120 |
Run DeepSeek-OCR-2 on a PIL image and return extracted text.
|
| 121 |
+
Works on both CPU (HF free tier) and GPU.
|
| 122 |
"""
|
| 123 |
prompt_text = (
|
| 124 |
"Convert the document to markdown."
|
|
|
|
| 132 |
|
| 133 |
try:
|
| 134 |
if hasattr(model, "infer"):
|
|
|
|
|
|
|
| 135 |
with tempfile.TemporaryDirectory() as out_dir:
|
| 136 |
+
# force_cpu() patches .cuda() → no-op so model.infer() works on CPU
|
| 137 |
with force_cpu():
|
| 138 |
result = model.infer(
|
| 139 |
tokenizer,
|
|
|
|
| 149 |
return result.get("text", str(result))
|
| 150 |
return str(result) if result else ""
|
| 151 |
|
| 152 |
+
# ── Fallback: standard generate() if model.infer() is not available ──
|
| 153 |
messages = [{"role": "user", "content": [
|
| 154 |
{"type": "image", "image": tmp_path},
|
| 155 |
{"type": "text", "text": prompt_text},
|