Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,35 +1,32 @@
|
|
| 1 |
import os
|
| 2 |
-
#
|
| 3 |
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
| 4 |
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
| 5 |
|
|
|
|
|
|
|
|
|
|
| 6 |
import gradio as gr
|
| 7 |
from transformers import AutoModel, AutoTokenizer
|
| 8 |
-
|
| 9 |
import tempfile
|
| 10 |
import shutil
|
| 11 |
from PIL import Image, ImageDraw, ImageFont, ImageOps
|
| 12 |
import fitz # PyMuPDF
|
| 13 |
import re
|
| 14 |
-
import numpy as np
|
| 15 |
import base64
|
| 16 |
from io import StringIO, BytesIO
|
| 17 |
|
| 18 |
"""
|
| 19 |
DeepSeek-OCR (CPU-only) Space app
|
| 20 |
|
| 21 |
-
|
| 22 |
-
-
|
| 23 |
-
- Forces CPU-only PyTorch via requirements.txt
|
| 24 |
-
- Ensures CUDA is disabled before importing torch
|
| 25 |
-
|
| 26 |
-
Notes:
|
| 27 |
-
- DeepSeek-OCR is a large model. CPU will be VERY slow and may hit RAM/time limits on free hardware.
|
| 28 |
"""
|
| 29 |
|
| 30 |
MODEL_NAME = "deepseek-ai/DeepSeek-OCR"
|
| 31 |
|
| 32 |
-
# Keep CPU threads reasonable (
|
| 33 |
try:
|
| 34 |
torch.set_num_threads(max(1, min(8, os.cpu_count() or 1)))
|
| 35 |
except Exception:
|
|
@@ -37,7 +34,6 @@ except Exception:
|
|
| 37 |
|
| 38 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
|
| 39 |
|
| 40 |
-
# CPU-safe load: float32, no flash-attn args, no .cuda()
|
| 41 |
model = AutoModel.from_pretrained(
|
| 42 |
MODEL_NAME,
|
| 43 |
torch_dtype=torch.float32,
|
|
@@ -62,11 +58,11 @@ TASK_PROMPTS = {
|
|
| 62 |
"✏️ Custom": {"prompt": "", "has_grounding": False},
|
| 63 |
}
|
| 64 |
|
| 65 |
-
def extract_grounding_references(text):
|
| 66 |
pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
|
| 67 |
return re.findall(pattern, text, re.DOTALL)
|
| 68 |
|
| 69 |
-
def draw_bounding_boxes(image, refs, extract_images=False):
|
| 70 |
img_w, img_h = image.size
|
| 71 |
img_draw = image.copy()
|
| 72 |
draw = ImageDraw.Draw(img_draw)
|
|
@@ -126,7 +122,7 @@ def draw_bounding_boxes(image, refs, extract_images=False):
|
|
| 126 |
img_draw.paste(overlay, (0, 0), overlay)
|
| 127 |
return img_draw, crops
|
| 128 |
|
| 129 |
-
def clean_output(text, include_images=False):
|
| 130 |
if not text:
|
| 131 |
return ""
|
| 132 |
pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
|
|
@@ -145,7 +141,7 @@ def clean_output(text, include_images=False):
|
|
| 145 |
|
| 146 |
return text.strip()
|
| 147 |
|
| 148 |
-
def embed_images(markdown, crops):
|
| 149 |
if not crops:
|
| 150 |
return markdown
|
| 151 |
for i, img in enumerate(crops):
|
|
@@ -159,11 +155,10 @@ def embed_images(markdown, crops):
|
|
| 159 |
)
|
| 160 |
return markdown
|
| 161 |
|
| 162 |
-
def
|
| 163 |
-
# DeepSeek model prints to stdout; capture it.
|
| 164 |
-
stdout = torch.sys.stdout if hasattr(torch, "sys") else None
|
| 165 |
import sys as _sys
|
| 166 |
-
|
| 167 |
_sys.stdout = StringIO()
|
| 168 |
try:
|
| 169 |
model.infer(
|
|
@@ -177,10 +172,10 @@ def _infer_with_model(prompt, jpg_path, out_dir, base_size, image_size, crop_mod
|
|
| 177 |
)
|
| 178 |
raw = _sys.stdout.getvalue()
|
| 179 |
finally:
|
| 180 |
-
_sys.stdout =
|
| 181 |
return raw
|
| 182 |
|
| 183 |
-
def process_image(image, mode, task, custom_prompt):
|
| 184 |
if image is None:
|
| 185 |
return "Error: Upload image", "", "", None, []
|
| 186 |
|
|
@@ -209,7 +204,7 @@ def process_image(image, mode, task, custom_prompt):
|
|
| 209 |
out_dir = tempfile.mkdtemp()
|
| 210 |
|
| 211 |
try:
|
| 212 |
-
raw_stdout =
|
| 213 |
prompt=prompt,
|
| 214 |
jpg_path=tmp.name,
|
| 215 |
out_dir=out_dir,
|
|
@@ -218,6 +213,7 @@ def process_image(image, mode, task, custom_prompt):
|
|
| 218 |
crop_mode=config["crop_mode"],
|
| 219 |
)
|
| 220 |
|
|
|
|
| 221 |
result = "\n".join(
|
| 222 |
[
|
| 223 |
l
|
|
@@ -263,7 +259,7 @@ def process_image(image, mode, task, custom_prompt):
|
|
| 263 |
pass
|
| 264 |
shutil.rmtree(out_dir, ignore_errors=True)
|
| 265 |
|
| 266 |
-
def process_pdf(path, mode, task, custom_prompt):
|
| 267 |
doc = fitz.open(path)
|
| 268 |
total_pages = len(doc)
|
| 269 |
|
|
@@ -276,14 +272,11 @@ def process_pdf(path, mode, task, custom_prompt):
|
|
| 276 |
pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
|
| 277 |
img = Image.open(BytesIO(pix.tobytes("png")))
|
| 278 |
|
| 279 |
-
cleaned, markdown,
|
| 280 |
-
|
| 281 |
-
if page_idx == 0 and (cleaned.startswith("Error") or cleaned == "No text"):
|
| 282 |
-
return cleaned, "", "", None, []
|
| 283 |
|
| 284 |
all_cleaned.append(cleaned)
|
| 285 |
all_markdown.append(markdown)
|
| 286 |
-
all_raw.append(
|
| 287 |
all_crops.extend(page_crops)
|
| 288 |
|
| 289 |
if page_img_out is not None:
|
|
@@ -317,7 +310,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR (CPU)") as demo:
|
|
| 317 |
"""
|
| 318 |
# 🐢 DeepSeek-OCR (CPU)
|
| 319 |
|
| 320 |
-
⚠️
|
| 321 |
Prefer **Tiny/Small** mode on CPU.
|
| 322 |
"""
|
| 323 |
)
|
|
@@ -326,13 +319,13 @@ Prefer **Tiny/Small** mode on CPU.
|
|
| 326 |
with gr.Column(scale=1):
|
| 327 |
file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
|
| 328 |
input_img = gr.Image(label="Input Image", type="pil", height=300)
|
| 329 |
-
mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Tiny", label="Mode
|
| 330 |
task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📝 Free OCR", label="Task")
|
| 331 |
prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
|
| 332 |
btn = gr.Button("Extract", variant="primary", size="lg")
|
| 333 |
|
| 334 |
with gr.Column(scale=2):
|
| 335 |
-
with gr.Tabs()
|
| 336 |
with gr.Tab("Text"):
|
| 337 |
text_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
|
| 338 |
with gr.Tab("Markdown Preview"):
|
|
@@ -344,7 +337,6 @@ Prefer **Tiny/Small** mode on CPU.
|
|
| 344 |
with gr.Tab("Raw Text"):
|
| 345 |
raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
|
| 346 |
|
| 347 |
-
file_in.change(lambda fp: Image.open(fp) if fp and not fp.lower().endswith(".pdf") else None, [file_in], [input_img])
|
| 348 |
task.change(toggle_prompt, [task], [prompt])
|
| 349 |
|
| 350 |
btn.click(
|
|
|
|
| 1 |
import os
|
| 2 |
+
# Disable CUDA paths before importing torch
|
| 3 |
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
| 4 |
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
| 5 |
|
| 6 |
+
import numpy as np # IMPORTANT: must be before torch in some environments
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
import gradio as gr
|
| 10 |
from transformers import AutoModel, AutoTokenizer
|
| 11 |
+
|
| 12 |
import tempfile
|
| 13 |
import shutil
|
| 14 |
from PIL import Image, ImageDraw, ImageFont, ImageOps
|
| 15 |
import fitz # PyMuPDF
|
| 16 |
import re
|
|
|
|
| 17 |
import base64
|
| 18 |
from io import StringIO, BytesIO
|
| 19 |
|
| 20 |
"""
|
| 21 |
DeepSeek-OCR (CPU-only) Space app
|
| 22 |
|
| 23 |
+
- No FlashAttention / no CUDA required.
|
| 24 |
+
- Designed to run on Hugging Face CPU spaces (VERY SLOW).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
"""
|
| 26 |
|
| 27 |
MODEL_NAME = "deepseek-ai/DeepSeek-OCR"
|
| 28 |
|
| 29 |
+
# Keep CPU threads reasonable (optional)
|
| 30 |
try:
|
| 31 |
torch.set_num_threads(max(1, min(8, os.cpu_count() or 1)))
|
| 32 |
except Exception:
|
|
|
|
| 34 |
|
| 35 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
|
| 36 |
|
|
|
|
| 37 |
model = AutoModel.from_pretrained(
|
| 38 |
MODEL_NAME,
|
| 39 |
torch_dtype=torch.float32,
|
|
|
|
| 58 |
"✏️ Custom": {"prompt": "", "has_grounding": False},
|
| 59 |
}
|
| 60 |
|
| 61 |
+
def extract_grounding_references(text: str):
    """Find every grounding reference embedded in model output.

    A grounding reference has the form
    ``<|ref|>LABEL<|/ref|><|det|>COORDS<|/det|>``.

    Args:
        text: Raw model output to scan.

    Returns:
        A list of ``(full_match, label, coords)`` string tuples in the order
        they occur in ``text``; an empty list when ``text`` is empty or
        ``None``.
    """
    # Same falsy-input guard as clean_output(): treat missing output as
    # "nothing found" instead of letting re.findall raise TypeError on None.
    if not text:
        return []
    # DOTALL lets a label or coordinate list span several lines.
    pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
    return re.findall(pattern, text, re.DOTALL)
|
| 64 |
|
| 65 |
+
def draw_bounding_boxes(image: Image.Image, refs, extract_images: bool = False):
|
| 66 |
img_w, img_h = image.size
|
| 67 |
img_draw = image.copy()
|
| 68 |
draw = ImageDraw.Draw(img_draw)
|
|
|
|
| 122 |
img_draw.paste(overlay, (0, 0), overlay)
|
| 123 |
return img_draw, crops
|
| 124 |
|
| 125 |
+
def clean_output(text: str, include_images: bool = False) -> str:
|
| 126 |
if not text:
|
| 127 |
return ""
|
| 128 |
pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
|
|
|
|
| 141 |
|
| 142 |
return text.strip()
|
| 143 |
|
| 144 |
+
def embed_images(markdown: str, crops):
|
| 145 |
if not crops:
|
| 146 |
return markdown
|
| 147 |
for i, img in enumerate(crops):
|
|
|
|
| 155 |
)
|
| 156 |
return markdown
|
| 157 |
|
| 158 |
+
def infer_with_model(prompt: str, jpg_path: str, out_dir: str, base_size: int, image_size: int, crop_mode: bool) -> str:
|
| 159 |
+
# DeepSeek model prints to stdout; capture it safely.
|
|
|
|
| 160 |
import sys as _sys
|
| 161 |
+
old_stdout = _sys.stdout
|
| 162 |
_sys.stdout = StringIO()
|
| 163 |
try:
|
| 164 |
model.infer(
|
|
|
|
| 172 |
)
|
| 173 |
raw = _sys.stdout.getvalue()
|
| 174 |
finally:
|
| 175 |
+
_sys.stdout = old_stdout
|
| 176 |
return raw
|
| 177 |
|
| 178 |
+
def process_image(image: Image.Image, mode: str, task: str, custom_prompt: str):
|
| 179 |
if image is None:
|
| 180 |
return "Error: Upload image", "", "", None, []
|
| 181 |
|
|
|
|
| 204 |
out_dir = tempfile.mkdtemp()
|
| 205 |
|
| 206 |
try:
|
| 207 |
+
raw_stdout = infer_with_model(
|
| 208 |
prompt=prompt,
|
| 209 |
jpg_path=tmp.name,
|
| 210 |
out_dir=out_dir,
|
|
|
|
| 213 |
crop_mode=config["crop_mode"],
|
| 214 |
)
|
| 215 |
|
| 216 |
+
# Filter noisy lines (progress/debug)
|
| 217 |
result = "\n".join(
|
| 218 |
[
|
| 219 |
l
|
|
|
|
| 259 |
pass
|
| 260 |
shutil.rmtree(out_dir, ignore_errors=True)
|
| 261 |
|
| 262 |
+
def process_pdf(path: str, mode: str, task: str, custom_prompt: str):
|
| 263 |
doc = fitz.open(path)
|
| 264 |
total_pages = len(doc)
|
| 265 |
|
|
|
|
| 272 |
pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
|
| 273 |
img = Image.open(BytesIO(pix.tobytes("png")))
|
| 274 |
|
| 275 |
+
cleaned, markdown, raw, page_img_out, page_crops = process_image(img, mode, task, custom_prompt)
|
|
|
|
|
|
|
|
|
|
| 276 |
|
| 277 |
all_cleaned.append(cleaned)
|
| 278 |
all_markdown.append(markdown)
|
| 279 |
+
all_raw.append(raw)
|
| 280 |
all_crops.extend(page_crops)
|
| 281 |
|
| 282 |
if page_img_out is not None:
|
|
|
|
| 310 |
"""
|
| 311 |
# 🐢 DeepSeek-OCR (CPU)
|
| 312 |
|
| 313 |
+
⚠️ CPU is **very slow** and may fail on large images/PDFs due to RAM/time limits.
|
| 314 |
Prefer **Tiny/Small** mode on CPU.
|
| 315 |
"""
|
| 316 |
)
|
|
|
|
| 319 |
with gr.Column(scale=1):
|
| 320 |
file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
|
| 321 |
input_img = gr.Image(label="Input Image", type="pil", height=300)
|
| 322 |
+
mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Tiny", label="Mode")
|
| 323 |
task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📝 Free OCR", label="Task")
|
| 324 |
prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
|
| 325 |
btn = gr.Button("Extract", variant="primary", size="lg")
|
| 326 |
|
| 327 |
with gr.Column(scale=2):
|
| 328 |
+
with gr.Tabs():
|
| 329 |
with gr.Tab("Text"):
|
| 330 |
text_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
|
| 331 |
with gr.Tab("Markdown Preview"):
|
|
|
|
| 337 |
with gr.Tab("Raw Text"):
|
| 338 |
raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
|
| 339 |
|
|
|
|
| 340 |
task.change(toggle_prompt, [task], [prompt])
|
| 341 |
|
| 342 |
btn.click(
|