defatul committed on
Commit
b3c04d7
·
verified ·
1 Parent(s): 9e97e45

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -33
app.py CHANGED
@@ -1,35 +1,32 @@
1
  import os
2
- # Hard-disable CUDA paths BEFORE importing torch/transformers
3
  os.environ["CUDA_VISIBLE_DEVICES"] = ""
4
  os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
5
 
 
 
 
6
  import gradio as gr
7
  from transformers import AutoModel, AutoTokenizer
8
- import torch
9
  import tempfile
10
  import shutil
11
  from PIL import Image, ImageDraw, ImageFont, ImageOps
12
  import fitz # PyMuPDF
13
  import re
14
- import numpy as np
15
  import base64
16
  from io import StringIO, BytesIO
17
 
18
  """
19
  DeepSeek-OCR (CPU-only) Space app
20
 
21
- What this fixes:
22
- - No FlashAttention2 / no CUDA required
23
- - Forces CPU-only PyTorch via requirements.txt
24
- - Ensures CUDA is disabled before importing torch
25
-
26
- Notes:
27
- - DeepSeek-OCR is a large model. CPU will be VERY slow and may hit RAM/time limits on free hardware.
28
  """
29
 
30
  MODEL_NAME = "deepseek-ai/DeepSeek-OCR"
31
 
32
- # Keep CPU threads reasonable (tweak if you want)
33
  try:
34
  torch.set_num_threads(max(1, min(8, os.cpu_count() or 1)))
35
  except Exception:
@@ -37,7 +34,6 @@ except Exception:
37
 
38
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
39
 
40
- # CPU-safe load: float32, no flash-attn args, no .cuda()
41
  model = AutoModel.from_pretrained(
42
  MODEL_NAME,
43
  torch_dtype=torch.float32,
@@ -62,11 +58,11 @@ TASK_PROMPTS = {
62
  "✏️ Custom": {"prompt": "", "has_grounding": False},
63
  }
64
 
65
- def extract_grounding_references(text):
66
  pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
67
  return re.findall(pattern, text, re.DOTALL)
68
 
69
- def draw_bounding_boxes(image, refs, extract_images=False):
70
  img_w, img_h = image.size
71
  img_draw = image.copy()
72
  draw = ImageDraw.Draw(img_draw)
@@ -126,7 +122,7 @@ def draw_bounding_boxes(image, refs, extract_images=False):
126
  img_draw.paste(overlay, (0, 0), overlay)
127
  return img_draw, crops
128
 
129
- def clean_output(text, include_images=False):
130
  if not text:
131
  return ""
132
  pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
@@ -145,7 +141,7 @@ def clean_output(text, include_images=False):
145
 
146
  return text.strip()
147
 
148
- def embed_images(markdown, crops):
149
  if not crops:
150
  return markdown
151
  for i, img in enumerate(crops):
@@ -159,11 +155,10 @@ def embed_images(markdown, crops):
159
  )
160
  return markdown
161
 
162
- def _infer_with_model(prompt, jpg_path, out_dir, base_size, image_size, crop_mode):
163
- # DeepSeek model prints to stdout; capture it.
164
- stdout = torch.sys.stdout if hasattr(torch, "sys") else None
165
  import sys as _sys
166
- old = _sys.stdout
167
  _sys.stdout = StringIO()
168
  try:
169
  model.infer(
@@ -177,10 +172,10 @@ def _infer_with_model(prompt, jpg_path, out_dir, base_size, image_size, crop_mod
177
  )
178
  raw = _sys.stdout.getvalue()
179
  finally:
180
- _sys.stdout = old
181
  return raw
182
 
183
- def process_image(image, mode, task, custom_prompt):
184
  if image is None:
185
  return "Error: Upload image", "", "", None, []
186
 
@@ -209,7 +204,7 @@ def process_image(image, mode, task, custom_prompt):
209
  out_dir = tempfile.mkdtemp()
210
 
211
  try:
212
- raw_stdout = _infer_with_model(
213
  prompt=prompt,
214
  jpg_path=tmp.name,
215
  out_dir=out_dir,
@@ -218,6 +213,7 @@ def process_image(image, mode, task, custom_prompt):
218
  crop_mode=config["crop_mode"],
219
  )
220
 
 
221
  result = "\n".join(
222
  [
223
  l
@@ -263,7 +259,7 @@ def process_image(image, mode, task, custom_prompt):
263
  pass
264
  shutil.rmtree(out_dir, ignore_errors=True)
265
 
266
- def process_pdf(path, mode, task, custom_prompt):
267
  doc = fitz.open(path)
268
  total_pages = len(doc)
269
 
@@ -276,14 +272,11 @@ def process_pdf(path, mode, task, custom_prompt):
276
  pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
277
  img = Image.open(BytesIO(pix.tobytes("png")))
278
 
279
- cleaned, markdown, result, page_img_out, page_crops = process_image(img, mode, task, custom_prompt)
280
-
281
- if page_idx == 0 and (cleaned.startswith("Error") or cleaned == "No text"):
282
- return cleaned, "", "", None, []
283
 
284
  all_cleaned.append(cleaned)
285
  all_markdown.append(markdown)
286
- all_raw.append(result)
287
  all_crops.extend(page_crops)
288
 
289
  if page_img_out is not None:
@@ -317,7 +310,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR (CPU)") as demo:
317
  """
318
  # 🐢 DeepSeek-OCR (CPU)
319
 
320
- ⚠️ **CPU is very slow** and may fail on large images/PDFs due to RAM/time limits.
321
  Prefer **Tiny/Small** mode on CPU.
322
  """
323
  )
@@ -326,13 +319,13 @@ Prefer **Tiny/Small** mode on CPU.
326
  with gr.Column(scale=1):
327
  file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
328
  input_img = gr.Image(label="Input Image", type="pil", height=300)
329
- mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Tiny", label="Mode (CPU recommend: Tiny/Small)")
330
  task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📝 Free OCR", label="Task")
331
  prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
332
  btn = gr.Button("Extract", variant="primary", size="lg")
333
 
334
  with gr.Column(scale=2):
335
- with gr.Tabs() as tabs:
336
  with gr.Tab("Text"):
337
  text_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
338
  with gr.Tab("Markdown Preview"):
@@ -344,7 +337,6 @@ Prefer **Tiny/Small** mode on CPU.
344
  with gr.Tab("Raw Text"):
345
  raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
346
 
347
- file_in.change(lambda fp: Image.open(fp) if fp and not fp.lower().endswith(".pdf") else None, [file_in], [input_img])
348
  task.change(toggle_prompt, [task], [prompt])
349
 
350
  btn.click(
 
1
  import os
2
+ # Disable CUDA paths before importing torch
3
  os.environ["CUDA_VISIBLE_DEVICES"] = ""
4
  os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
5
 
6
+ import numpy as np # IMPORTANT: must be before torch in some environments
7
+
8
+ import torch
9
  import gradio as gr
10
  from transformers import AutoModel, AutoTokenizer
11
+
12
  import tempfile
13
  import shutil
14
  from PIL import Image, ImageDraw, ImageFont, ImageOps
15
  import fitz # PyMuPDF
16
  import re
 
17
  import base64
18
  from io import StringIO, BytesIO
19
 
20
  """
21
  DeepSeek-OCR (CPU-only) Space app
22
 
23
+ - No FlashAttention / no CUDA required.
24
+ - Designed to run on Hugging Face CPU spaces (VERY SLOW).
 
 
 
 
 
25
  """
26
 
27
  MODEL_NAME = "deepseek-ai/DeepSeek-OCR"
28
 
29
+ # Keep CPU threads reasonable (optional)
30
  try:
31
  torch.set_num_threads(max(1, min(8, os.cpu_count() or 1)))
32
  except Exception:
 
34
 
35
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
36
 
 
37
  model = AutoModel.from_pretrained(
38
  MODEL_NAME,
39
  torch_dtype=torch.float32,
 
58
  "✏️ Custom": {"prompt": "", "has_grounding": False},
59
  }
60
 
61
def extract_grounding_references(text: str):
    """Locate DeepSeek grounding tags in *text*.

    Scans for every ``<|ref|>...<|/ref|><|det|>...<|/det|>`` pair (the
    model's bounding-box annotation syntax) and returns a list of
    ``(full_match, label, detection)`` tuples. ``re.DOTALL`` lets a tag's
    contents span multiple lines.
    """
    grounding = re.compile(
        r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)',
        re.DOTALL,
    )
    return grounding.findall(text)
64
 
65
+ def draw_bounding_boxes(image: Image.Image, refs, extract_images: bool = False):
66
  img_w, img_h = image.size
67
  img_draw = image.copy()
68
  draw = ImageDraw.Draw(img_draw)
 
122
  img_draw.paste(overlay, (0, 0), overlay)
123
  return img_draw, crops
124
 
125
+ def clean_output(text: str, include_images: bool = False) -> str:
126
  if not text:
127
  return ""
128
  pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
 
141
 
142
  return text.strip()
143
 
144
+ def embed_images(markdown: str, crops):
145
  if not crops:
146
  return markdown
147
  for i, img in enumerate(crops):
 
155
  )
156
  return markdown
157
 
158
+ def infer_with_model(prompt: str, jpg_path: str, out_dir: str, base_size: int, image_size: int, crop_mode: bool) -> str:
159
+ # DeepSeek model prints to stdout; capture it safely.
 
160
  import sys as _sys
161
+ old_stdout = _sys.stdout
162
  _sys.stdout = StringIO()
163
  try:
164
  model.infer(
 
172
  )
173
  raw = _sys.stdout.getvalue()
174
  finally:
175
+ _sys.stdout = old_stdout
176
  return raw
177
 
178
+ def process_image(image: Image.Image, mode: str, task: str, custom_prompt: str):
179
  if image is None:
180
  return "Error: Upload image", "", "", None, []
181
 
 
204
  out_dir = tempfile.mkdtemp()
205
 
206
  try:
207
+ raw_stdout = infer_with_model(
208
  prompt=prompt,
209
  jpg_path=tmp.name,
210
  out_dir=out_dir,
 
213
  crop_mode=config["crop_mode"],
214
  )
215
 
216
+ # Filter noisy lines (progress/debug)
217
  result = "\n".join(
218
  [
219
  l
 
259
  pass
260
  shutil.rmtree(out_dir, ignore_errors=True)
261
 
262
+ def process_pdf(path: str, mode: str, task: str, custom_prompt: str):
263
  doc = fitz.open(path)
264
  total_pages = len(doc)
265
 
 
272
  pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
273
  img = Image.open(BytesIO(pix.tobytes("png")))
274
 
275
+ cleaned, markdown, raw, page_img_out, page_crops = process_image(img, mode, task, custom_prompt)
 
 
 
276
 
277
  all_cleaned.append(cleaned)
278
  all_markdown.append(markdown)
279
+ all_raw.append(raw)
280
  all_crops.extend(page_crops)
281
 
282
  if page_img_out is not None:
 
310
  """
311
  # 🐢 DeepSeek-OCR (CPU)
312
 
313
+ ⚠️ CPU is **very slow** and may fail on large images/PDFs due to RAM/time limits.
314
  Prefer **Tiny/Small** mode on CPU.
315
  """
316
  )
 
319
  with gr.Column(scale=1):
320
  file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
321
  input_img = gr.Image(label="Input Image", type="pil", height=300)
322
+ mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Tiny", label="Mode")
323
  task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📝 Free OCR", label="Task")
324
  prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
325
  btn = gr.Button("Extract", variant="primary", size="lg")
326
 
327
  with gr.Column(scale=2):
328
+ with gr.Tabs():
329
  with gr.Tab("Text"):
330
  text_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
331
  with gr.Tab("Markdown Preview"):
 
337
  with gr.Tab("Raw Text"):
338
  raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
339
 
 
340
  task.change(toggle_prompt, [task], [prompt])
341
 
342
  btn.click(