defatul committed on
Commit
b3c04d7
·
verified ·
1 Parent(s): 9e97e45

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -33
app.py CHANGED
@@ -1,35 +1,32 @@
1
  import os
2
- # Hard-disable CUDA paths BEFORE importing torch/transformers
3
  os.environ["CUDA_VISIBLE_DEVICES"] = ""
4
  os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
5
 
 
 
 
6
  import gradio as gr
7
  from transformers import AutoModel, AutoTokenizer
8
- import torch
9
  import tempfile
10
  import shutil
11
  from PIL import Image, ImageDraw, ImageFont, ImageOps
12
  import fitz # PyMuPDF
13
  import re
14
- import numpy as np
15
  import base64
16
  from io import StringIO, BytesIO
17
 
18
  """
19
  DeepSeek-OCR (CPU-only) Space app
20
 
21
- What this fixes:
22
- - No FlashAttention2 / no CUDA required
23
- - Forces CPU-only PyTorch via requirements.txt
24
- - Ensures CUDA is disabled before importing torch
25
-
26
- Notes:
27
- - DeepSeek-OCR is a large model. CPU will be VERY slow and may hit RAM/time limits on free hardware.
28
  """
29
 
30
  MODEL_NAME = "deepseek-ai/DeepSeek-OCR"
31
 
32
- # Keep CPU threads reasonable (tweak if you want)
33
  try:
34
  torch.set_num_threads(max(1, min(8, os.cpu_count() or 1)))
35
  except Exception:
@@ -37,7 +34,6 @@ except Exception:
37
 
38
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
39
 
40
- # CPU-safe load: float32, no flash-attn args, no .cuda()
41
  model = AutoModel.from_pretrained(
42
  MODEL_NAME,
43
  torch_dtype=torch.float32,
@@ -62,11 +58,11 @@ TASK_PROMPTS = {
62
  "✏️ Custom": {"prompt": "", "has_grounding": False},
63
  }
64
 
65
- def extract_grounding_references(text):
66
  pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
67
  return re.findall(pattern, text, re.DOTALL)
68
 
69
- def draw_bounding_boxes(image, refs, extract_images=False):
70
  img_w, img_h = image.size
71
  img_draw = image.copy()
72
  draw = ImageDraw.Draw(img_draw)
@@ -126,7 +122,7 @@ def draw_bounding_boxes(image, refs, extract_images=False):
126
  img_draw.paste(overlay, (0, 0), overlay)
127
  return img_draw, crops
128
 
129
- def clean_output(text, include_images=False):
130
  if not text:
131
  return ""
132
  pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
@@ -145,7 +141,7 @@ def clean_output(text, include_images=False):
145
 
146
  return text.strip()
147
 
148
- def embed_images(markdown, crops):
149
  if not crops:
150
  return markdown
151
  for i, img in enumerate(crops):
@@ -159,11 +155,10 @@ def embed_images(markdown, crops):
159
  )
160
  return markdown
161
 
162
- def _infer_with_model(prompt, jpg_path, out_dir, base_size, image_size, crop_mode):
163
- # DeepSeek model prints to stdout; capture it.
164
- stdout = torch.sys.stdout if hasattr(torch, "sys") else None
165
  import sys as _sys
166
- old = _sys.stdout
167
  _sys.stdout = StringIO()
168
  try:
169
  model.infer(
@@ -177,10 +172,10 @@ def _infer_with_model(prompt, jpg_path, out_dir, base_size, image_size, crop_mod
177
  )
178
  raw = _sys.stdout.getvalue()
179
  finally:
180
- _sys.stdout = old
181
  return raw
182
 
183
- def process_image(image, mode, task, custom_prompt):
184
  if image is None:
185
  return "Error: Upload image", "", "", None, []
186
 
@@ -209,7 +204,7 @@ def process_image(image, mode, task, custom_prompt):
209
  out_dir = tempfile.mkdtemp()
210
 
211
  try:
212
- raw_stdout = _infer_with_model(
213
  prompt=prompt,
214
  jpg_path=tmp.name,
215
  out_dir=out_dir,
@@ -218,6 +213,7 @@ def process_image(image, mode, task, custom_prompt):
218
  crop_mode=config["crop_mode"],
219
  )
220
 
 
221
  result = "\n".join(
222
  [
223
  l
@@ -263,7 +259,7 @@ def process_image(image, mode, task, custom_prompt):
263
  pass
264
  shutil.rmtree(out_dir, ignore_errors=True)
265
 
266
- def process_pdf(path, mode, task, custom_prompt):
267
  doc = fitz.open(path)
268
  total_pages = len(doc)
269
 
@@ -276,14 +272,11 @@ def process_pdf(path, mode, task, custom_prompt):
276
  pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
277
  img = Image.open(BytesIO(pix.tobytes("png")))
278
 
279
- cleaned, markdown, result, page_img_out, page_crops = process_image(img, mode, task, custom_prompt)
280
-
281
- if page_idx == 0 and (cleaned.startswith("Error") or cleaned == "No text"):
282
- return cleaned, "", "", None, []
283
 
284
  all_cleaned.append(cleaned)
285
  all_markdown.append(markdown)
286
- all_raw.append(result)
287
  all_crops.extend(page_crops)
288
 
289
  if page_img_out is not None:
@@ -317,7 +310,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR (CPU)") as demo:
317
  """
318
  # 🐢 DeepSeek-OCR (CPU)
319
 
320
- ⚠️ **CPU is very slow** and may fail on large images/PDFs due to RAM/time limits.
321
  Prefer **Tiny/Small** mode on CPU.
322
  """
323
  )
@@ -326,13 +319,13 @@ Prefer **Tiny/Small** mode on CPU.
326
  with gr.Column(scale=1):
327
  file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
328
  input_img = gr.Image(label="Input Image", type="pil", height=300)
329
- mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Tiny", label="Mode (CPU recommend: Tiny/Small)")
330
  task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📝 Free OCR", label="Task")
331
  prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
332
  btn = gr.Button("Extract", variant="primary", size="lg")
333
 
334
  with gr.Column(scale=2):
335
- with gr.Tabs() as tabs:
336
  with gr.Tab("Text"):
337
  text_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
338
  with gr.Tab("Markdown Preview"):
@@ -344,7 +337,6 @@ Prefer **Tiny/Small** mode on CPU.
344
  with gr.Tab("Raw Text"):
345
  raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
346
 
347
- file_in.change(lambda fp: Image.open(fp) if fp and not fp.lower().endswith(".pdf") else None, [file_in], [input_img])
348
  task.change(toggle_prompt, [task], [prompt])
349
 
350
  btn.click(
 
1
  import os
2
+ # Disable CUDA paths before importing torch
3
  os.environ["CUDA_VISIBLE_DEVICES"] = ""
4
  os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
5
 
6
+ import numpy as np # IMPORTANT: must be before torch in some environments
7
+
8
+ import torch
9
  import gradio as gr
10
  from transformers import AutoModel, AutoTokenizer
11
+
12
  import tempfile
13
  import shutil
14
  from PIL import Image, ImageDraw, ImageFont, ImageOps
15
  import fitz # PyMuPDF
16
  import re
 
17
  import base64
18
  from io import StringIO, BytesIO
19
 
20
  """
21
  DeepSeek-OCR (CPU-only) Space app
22
 
23
+ - No FlashAttention / no CUDA required.
24
+ - Designed to run on Hugging Face CPU spaces (VERY SLOW).
 
 
 
 
 
25
  """
26
 
27
  MODEL_NAME = "deepseek-ai/DeepSeek-OCR"
28
 
29
+ # Keep CPU threads reasonable (optional)
30
  try:
31
  torch.set_num_threads(max(1, min(8, os.cpu_count() or 1)))
32
  except Exception:
 
34
 
35
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
36
 
 
37
  model = AutoModel.from_pretrained(
38
  MODEL_NAME,
39
  torch_dtype=torch.float32,
 
58
  "✏️ Custom": {"prompt": "", "has_grounding": False},
59
  }
60
 
61
def extract_grounding_references(text: str):
    """Locate DeepSeek grounding tags in *text*.

    Scans for every ``<|ref|>...<|/ref|><|det|>...<|/det|>`` pair (the
    model's bounding-box annotation syntax) and returns a list of
    ``(full_match, label, detection)`` tuples. ``re.DOTALL`` lets a tag's
    contents span multiple lines.
    """
    grounding = re.compile(
        r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)',
        re.DOTALL,
    )
    return grounding.findall(text)
64
 
65
+ def draw_bounding_boxes(image: Image.Image, refs, extract_images: bool = False):
66
  img_w, img_h = image.size
67
  img_draw = image.copy()
68
  draw = ImageDraw.Draw(img_draw)
 
122
  img_draw.paste(overlay, (0, 0), overlay)
123
  return img_draw, crops
124
 
125
+ def clean_output(text: str, include_images: bool = False) -> str:
126
  if not text:
127
  return ""
128
  pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
 
141
 
142
  return text.strip()
143
 
144
+ def embed_images(markdown: str, crops):
145
  if not crops:
146
  return markdown
147
  for i, img in enumerate(crops):
 
155
  )
156
  return markdown
157
 
158
+ def infer_with_model(prompt: str, jpg_path: str, out_dir: str, base_size: int, image_size: int, crop_mode: bool) -> str:
159
+ # DeepSeek model prints to stdout; capture it safely.
 
160
  import sys as _sys
161
+ old_stdout = _sys.stdout
162
  _sys.stdout = StringIO()
163
  try:
164
  model.infer(
 
172
  )
173
  raw = _sys.stdout.getvalue()
174
  finally:
175
+ _sys.stdout = old_stdout
176
  return raw
177
 
178
+ def process_image(image: Image.Image, mode: str, task: str, custom_prompt: str):
179
  if image is None:
180
  return "Error: Upload image", "", "", None, []
181
 
 
204
  out_dir = tempfile.mkdtemp()
205
 
206
  try:
207
+ raw_stdout = infer_with_model(
208
  prompt=prompt,
209
  jpg_path=tmp.name,
210
  out_dir=out_dir,
 
213
  crop_mode=config["crop_mode"],
214
  )
215
 
216
+ # Filter noisy lines (progress/debug)
217
  result = "\n".join(
218
  [
219
  l
 
259
  pass
260
  shutil.rmtree(out_dir, ignore_errors=True)
261
 
262
+ def process_pdf(path: str, mode: str, task: str, custom_prompt: str):
263
  doc = fitz.open(path)
264
  total_pages = len(doc)
265
 
 
272
  pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
273
  img = Image.open(BytesIO(pix.tobytes("png")))
274
 
275
+ cleaned, markdown, raw, page_img_out, page_crops = process_image(img, mode, task, custom_prompt)
 
 
 
276
 
277
  all_cleaned.append(cleaned)
278
  all_markdown.append(markdown)
279
+ all_raw.append(raw)
280
  all_crops.extend(page_crops)
281
 
282
  if page_img_out is not None:
 
310
  """
311
  # 🐢 DeepSeek-OCR (CPU)
312
 
313
+ ⚠️ CPU is **very slow** and may fail on large images/PDFs due to RAM/time limits.
314
  Prefer **Tiny/Small** mode on CPU.
315
  """
316
  )
 
319
  with gr.Column(scale=1):
320
  file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
321
  input_img = gr.Image(label="Input Image", type="pil", height=300)
322
+ mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Tiny", label="Mode")
323
  task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📝 Free OCR", label="Task")
324
  prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
325
  btn = gr.Button("Extract", variant="primary", size="lg")
326
 
327
  with gr.Column(scale=2):
328
+ with gr.Tabs():
329
  with gr.Tab("Text"):
330
  text_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
331
  with gr.Tab("Markdown Preview"):
 
337
  with gr.Tab("Raw Text"):
338
  raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
339
 
 
340
  task.change(toggle_prompt, [task], [prompt])
341
 
342
  btn.click(