ZienabM committed on
Commit
80af787
·
verified ·
1 Parent(s): 0dc3fe9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -41
app.py CHANGED
@@ -37,10 +37,10 @@ async def lifespan(app: FastAPI):
37
  MODEL_NAME,
38
  _attn_implementation="eager",
39
  trust_remote_code=True,
40
- torch_dtype=torch.float32,
41
  )
42
  model.eval()
43
- log.info("Model ready (cpu)")
44
  yield
45
  del model, tokenizer
46
 
@@ -54,64 +54,78 @@ from contextlib import contextmanager
54
  @contextmanager
55
  def force_cpu():
56
  """
57
- DeepSeek-OCR-2's model.infer() hardcodes .cuda() even when no GPU is present.
58
- This context manager temporarily replaces all CUDA-moving calls with no-ops
59
- so the model runs on CPU without modification.
 
 
 
 
60
  """
61
- # Save originals
62
- _tensor_cuda = torch.Tensor.cuda
63
- _module_cuda = torch.nn.Module.cuda
64
- _tensor_to = torch.Tensor.to
65
- _module_to = torch.nn.Module.to
66
 
67
- # Tensor.cuda() → return self (stay on CPU)
 
 
 
 
 
 
 
68
  def _noop_tensor_cuda(self, device=None, *args, **kwargs):
69
  return self
70
 
71
- # Module.cuda() → return self
72
  def _noop_module_cuda(self, device=None):
73
  return self
74
 
75
- # Tensor.to("cuda") / to(device) → stay on CPU; allow dtype casts
 
76
  def _safe_tensor_to(self, *args, **kwargs):
77
- filtered = [
78
- a for a in args
79
- if not (isinstance(a, (str, torch.device)) and "cuda" in str(a))
80
- ]
81
  kwargs.pop("device", None)
82
- if filtered or kwargs:
83
- try:
84
- return _tensor_to(self, *filtered, **kwargs)
85
- except Exception:
86
- return self
87
- return self
88
 
89
- # Module.to("cuda") → stay on CPU; allow dtype casts
90
  def _safe_module_to(self, *args, **kwargs):
91
- filtered = [
92
- a for a in args
93
- if not (isinstance(a, (str, torch.device)) and "cuda" in str(a))
94
- ]
95
  kwargs.pop("device", None)
96
- if filtered or kwargs:
97
- try:
98
- return _module_to(self, *filtered, **kwargs)
99
- except Exception:
100
- return self
 
 
 
 
101
  return self
102
 
103
- torch.Tensor.cuda = _noop_tensor_cuda
104
- torch.nn.Module.cuda = _noop_module_cuda
105
- torch.Tensor.to = _safe_tensor_to
106
- torch.nn.Module.to = _safe_module_to
 
 
 
 
 
 
107
 
108
  try:
109
  yield
110
  finally:
111
- torch.Tensor.cuda = _tensor_cuda
112
- torch.nn.Module.cuda = _module_cuda
113
- torch.Tensor.to = _tensor_to
114
- torch.nn.Module.to = _module_to
 
 
115
 
116
 
117
  # ─── Core OCR inference ───────────────────────────────────────────────────────
 
37
  MODEL_NAME,
38
  _attn_implementation="eager",
39
  trust_remote_code=True,
40
+ torch_dtype=torch.bfloat16,
41
  )
42
  model.eval()
43
+ log.info("Model ready (cpu, bfloat16)")
44
  yield
45
  del model, tokenizer
46
 
 
54
  @contextmanager
55
  def force_cpu():
56
  """
57
+ DeepSeek-OCR-2's model.infer() has three CPU-breaking issues:
58
+ 1. Hardcodes .cuda() calls → patched: .cuda() becomes a no-op
59
+ 2. Casts tensors to bfloat16 while model weights are float32
60
+ → patched: bfloat16 requests are silently changed to float32
61
+ 3. Uses torch.autocast("cuda") which can still cast internally
62
+ → patched: autocast is replaced with a no-op context manager
63
+ All patches are reverted after the 'with' block.
64
  """
65
+ import contextlib
 
 
 
 
66
 
67
+ _tensor_cuda = torch.Tensor.cuda
68
+ _module_cuda = torch.nn.Module.cuda
69
+ _tensor_to = torch.Tensor.to
70
+ _module_to = torch.nn.Module.to
71
+ _tensor_bf16 = torch.Tensor.bfloat16 # model may call .bfloat16() directly
72
+ _autocast = torch.autocast
73
+
74
+ # 1. .cuda() → stay on CPU (no-op)
75
  def _noop_tensor_cuda(self, device=None, *args, **kwargs):
76
  return self
77
 
 
78
  def _noop_module_cuda(self, device=None):
79
  return self
80
 
81
+ # 2a. .to() → strip CUDA device args; keep dtype as-is
82
+ # (model is loaded in bfloat16 so dtype is already consistent)
83
  def _safe_tensor_to(self, *args, **kwargs):
84
+ new_args = [a for a in args
85
+ if not (isinstance(a, (str, torch.device)) and "cuda" in str(a))]
 
 
86
  kwargs.pop("device", None)
87
+ if not new_args and not kwargs:
88
+ return self
89
+ try:
90
+ return _tensor_to(self, *new_args, **kwargs)
91
+ except Exception:
92
+ return self
93
 
 
94
  def _safe_module_to(self, *args, **kwargs):
95
+ new_args = [a for a in args
96
+ if not (isinstance(a, (str, torch.device)) and "cuda" in str(a))]
 
 
97
  kwargs.pop("device", None)
98
+ if not new_args and not kwargs:
99
+ return self
100
+ try:
101
+ return _module_to(self, *new_args, **kwargs)
102
+ except Exception:
103
+ return self
104
+
105
+ # 2b. .bfloat16() direct calls → no-op (tensor already in bfloat16)
106
+ def _noop_tensor_bf16(self):
107
  return self
108
 
109
+ # 3. torch.autocast("cuda", ...) → nullcontext (no-op on CPU)
110
+ def _noop_autocast(*args, **kwargs):
111
+ return contextlib.nullcontext()
112
+
113
+ torch.Tensor.cuda = _noop_tensor_cuda
114
+ torch.nn.Module.cuda = _noop_module_cuda
115
+ torch.Tensor.to = _safe_tensor_to
116
+ torch.nn.Module.to = _safe_module_to
117
+ torch.Tensor.bfloat16 = _noop_tensor_bf16
118
+ torch.autocast = _noop_autocast
119
 
120
  try:
121
  yield
122
  finally:
123
+ torch.Tensor.cuda = _tensor_cuda
124
+ torch.nn.Module.cuda = _module_cuda
125
+ torch.Tensor.to = _tensor_to
126
+ torch.nn.Module.to = _module_to
127
+ torch.Tensor.bfloat16 = _tensor_bf16
128
+ torch.autocast = _autocast
129
 
130
 
131
  # ─── Core OCR inference ───────────────────────────────────────────────────────