import os
# Disable CUDA paths before importing torch
os.environ["CUDA_VISIBLE_DEVICES"] = ""
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
import numpy as np # IMPORTANT: must be before torch in some environments
import torch
import gradio as gr
from transformers import AutoModel, AutoTokenizer
import tempfile
import shutil
from PIL import Image, ImageDraw, ImageFont, ImageOps
import fitz # PyMuPDF
import re
import ast  # for safely parsing coordinate lists emitted by the model
import base64
from io import StringIO, BytesIO
"""
DeepSeek-OCR (CPU-only) Space app
- No FlashAttention / no CUDA required.
- Designed to run on Hugging Face CPU spaces (VERY SLOW).
"""
MODEL_NAME = "deepseek-ai/DeepSeek-OCR"
# Keep CPU threads reasonable (optional)
try:
torch.set_num_threads(max(1, min(8, os.cpu_count() or 1)))
except Exception:
pass
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(
MODEL_NAME,
torch_dtype=torch.float32,
trust_remote_code=True,
use_safetensors=True,
)
model = model.eval() # stays on CPU
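# Resolution presets forwarded to model.infer(). As best understood from the
# model's remote inference code: base_size is the global-view resolution,
# image_size the per-tile resolution, and crop_mode enables dynamic tiling of
# large pages (the "Gundam" preset). Larger presets are more accurate but much
# slower on CPU.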
MODEL_CONFIGS = {
"Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
"Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
"Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
"Gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
"Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
}
TASK_PROMPTS = {
"๐Ÿ“‹ Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
"๐Ÿ“ Free OCR": {"prompt": "<image>\nFree OCR.", "has_grounding": False},
"๐Ÿ“ Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
"๐Ÿ” Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
"โœ๏ธ Custom": {"prompt": "", "has_grounding": False},
}
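# Grounded tasks emit spans of the form
#   <|ref|>label<|/ref|><|det|>[[x1, y1, x2, y2], ...]<|/det|>
# with coordinates normalized to a 0-999 grid. The helpers below parse these
# spans, draw bounding boxes, and strip the markup from the user-facing text.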
def extract_grounding_references(text: str):
pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
return re.findall(pattern, text, re.DOTALL)
def draw_bounding_boxes(image: Image.Image, refs, extract_images: bool = False):
img_w, img_h = image.size
img_draw = image.copy()
draw = ImageDraw.Draw(img_draw)
overlay = Image.new("RGBA", img_draw.size, (0, 0, 0, 0))
draw2 = ImageDraw.Draw(overlay)
font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
try:
font = ImageFont.truetype(font_path, 30)
except Exception:
font = ImageFont.load_default()
crops = []
color_map = {}
np.random.seed(42)
for ref in refs:
label = ref[1]
if label not in color_map:
color_map[label] = (
int(np.random.randint(50, 255)),
int(np.random.randint(50, 255)),
int(np.random.randint(50, 255)),
)
color = color_map[label]
try:
            coords = ast.literal_eval(ref[2])  # safer than eval() on model-generated text
except Exception:
continue
color_a = color + (60,)
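        # Box coordinates are on the model's 0-999 normalized grid; rescale to pixels.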
for box in coords:
x1, y1, x2, y2 = (
int(box[0] / 999 * img_w),
int(box[1] / 999 * img_h),
int(box[2] / 999 * img_w),
int(box[3] / 999 * img_h),
)
if extract_images and label == "image":
crops.append(image.crop((x1, y1, x2, y2)))
width = 5 if label == "title" else 3
draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
draw2.rectangle([x1, y1, x2, y2], fill=color_a)
try:
text_bbox = draw.textbbox((0, 0), label, font=font)
tw, th = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
except Exception:
tw, th = (len(label) * 10, 20)
ty = max(0, y1 - 20)
draw.rectangle([x1, ty, x1 + tw + 4, ty + th + 4], fill=color)
draw.text((x1 + 2, ty + 2), label, font=font, fill=(255, 255, 255))
img_draw.paste(overlay, (0, 0), overlay)
return img_draw, crops
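# Strip grounding markup from the raw model output. With include_images=True,
# image regions become numbered "**[Figure n]**" placeholders that
# embed_images() later replaces with inline crops; otherwise they are dropped.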
def clean_output(text: str, include_images: bool = False) -> str:
if not text:
return ""
pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
matches = re.findall(pattern, text, re.DOTALL)
img_num = 0
for match in matches:
if "<|ref|>image<|/ref|>" in match[0]:
if include_images:
text = text.replace(match[0], f"\n\n**[Figure {img_num + 1}]**\n\n", 1)
img_num += 1
else:
text = text.replace(match[0], "", 1)
else:
text = re.sub(rf"(?m)^[^\n]*{re.escape(match[0])}[^\n]*\n?", "", text)
return text.strip()
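# Replace the "**[Figure n]**" placeholders with base64 data-URI images so the
# Markdown preview tab can render the cropped figures inline.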
def embed_images(markdown: str, crops):
if not crops:
return markdown
for i, img in enumerate(crops):
buf = BytesIO()
img.save(buf, format="PNG")
b64 = base64.b64encode(buf.getvalue()).decode()
markdown = markdown.replace(
f"**[Figure {i + 1}]**",
f"\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n",
1,
)
return markdown
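# The model's custom infer() (loaded via trust_remote_code) prints its
# transcription to stdout rather than returning it, so we temporarily swap
# sys.stdout for a StringIO buffer and read it back afterwards. Any files the
# model writes to output_path are discarded; only the captured stdout is used.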
def infer_with_model(prompt: str, jpg_path: str, out_dir: str, base_size: int, image_size: int, crop_mode: bool) -> str:
# DeepSeek model prints to stdout; capture it safely.
import sys as _sys
old_stdout = _sys.stdout
_sys.stdout = StringIO()
try:
model.infer(
tokenizer=tokenizer,
prompt=prompt,
image_file=jpg_path,
output_path=out_dir,
base_size=base_size,
image_size=image_size,
crop_mode=crop_mode,
)
raw = _sys.stdout.getvalue()
finally:
_sys.stdout = old_stdout
return raw
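# Single-image pipeline: normalize the input (RGB conversion + EXIF orientation),
# build the task prompt, save a temporary JPEG for the model, run inference,
# filter progress noise out of the captured stdout, and (for grounded tasks)
# draw bounding boxes and collect figure crops.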
def process_image(image: Image.Image, mode: str, task: str, custom_prompt: str):
    if image is None:
        return "Error: Please upload an image", "", "", None, []
    if task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
        return "Error: Please enter a prompt", "", "", None, []
if image.mode in ("RGBA", "LA", "P"):
image = image.convert("RGB")
image = ImageOps.exif_transpose(image)
config = MODEL_CONFIGS[mode]
if task == "โœ๏ธ Custom":
prompt = f"<image>\n{custom_prompt.strip()}"
has_grounding = "<|grounding|>" in custom_prompt
    elif task == "📍 Locate":
prompt = f"<image>\nLocate <|ref|>{custom_prompt.strip()}<|/ref|> in the image."
has_grounding = True
else:
prompt = TASK_PROMPTS[task]["prompt"]
has_grounding = TASK_PROMPTS[task]["has_grounding"]
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg")
image.save(tmp.name, "JPEG", quality=95)
tmp.close()
out_dir = tempfile.mkdtemp()
try:
raw_stdout = infer_with_model(
prompt=prompt,
jpg_path=tmp.name,
out_dir=out_dir,
base_size=config["base_size"],
image_size=config["image_size"],
crop_mode=config["crop_mode"],
)
# Filter noisy lines (progress/debug)
result = "\n".join(
[
l
for l in raw_stdout.split("\n")
if not any(
s in l
for s in [
"image:",
"other:",
"PATCHES",
"====",
"BASE:",
"%|",
"torch.Size",
]
)
]
).strip()
if not result:
return "No text", "", "", None, []
cleaned = clean_output(result, False)
markdown = clean_output(result, True)
img_out = None
crops = []
if has_grounding and "<|ref|>" in result:
refs = extract_grounding_references(result)
if refs:
img_out, crops = draw_bounding_boxes(image, refs, True)
markdown = embed_images(markdown, crops)
return cleaned, markdown, result, img_out, crops
except Exception as e:
return f"Runtime error: {type(e).__name__}: {e}", "", "", None, []
finally:
try:
os.unlink(tmp.name)
except Exception:
pass
shutil.rmtree(out_dir, ignore_errors=True)
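# PDF pipeline: rasterize each page at 300 DPI via PyMuPDF and run it through
# process_image(); page results are joined with "--- Page Break ---" separators.
# Only the last page that produced grounding results is shown in the Boxes tab.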
def process_pdf(path: str, mode: str, task: str, custom_prompt: str):
doc = fitz.open(path)
total_pages = len(doc)
all_cleaned, all_markdown, all_raw, all_crops = [], [], [], []
img_out = None
try:
for page_idx in range(total_pages):
page = doc.load_page(page_idx)
pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
img = Image.open(BytesIO(pix.tobytes("png")))
cleaned, markdown, raw, page_img_out, page_crops = process_image(img, mode, task, custom_prompt)
all_cleaned.append(cleaned)
all_markdown.append(markdown)
all_raw.append(raw)
all_crops.extend(page_crops)
if page_img_out is not None:
img_out = page_img_out
combined_cleaned = "\n\n--- Page Break ---\n\n".join(all_cleaned)
combined_markdown = "\n\n--- Page Break ---\n\n".join(all_markdown)
combined_raw = "\n\n--- Page Break ---\n\n".join(all_raw)
return combined_cleaned, combined_markdown, combined_raw, img_out, all_crops
finally:
doc.close()
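# Dispatch: an uploaded file takes precedence over a pasted image; PDFs are
# processed page by page, everything else is treated as a single image.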
def run(image, file_path, mode, task, custom_prompt):
if file_path:
if file_path.lower().endswith(".pdf"):
return process_pdf(file_path, mode, task, custom_prompt)
return process_image(Image.open(file_path), mode, task, custom_prompt)
if image is not None:
return process_image(image, mode, task, custom_prompt)
return "Error: upload file or image", "", "", None, []
def toggle_prompt(task):
if task == "โœ๏ธ Custom":
return gr.update(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for boxes")
if task == "๐Ÿ“ Locate":
return gr.update(visible=True, label="Text to Locate", placeholder="Enter text")
return gr.update(visible=False)
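# Gradio UI: inputs (file/image, resolution preset, task, optional prompt) on
# the left; tabs for cleaned text, Markdown preview, grounding boxes, cropped
# figures, and raw model output on the right.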
with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR (CPU)") as demo:
gr.Markdown(
"""
# ๐Ÿข DeepSeek-OCR (CPU)
โš ๏ธ CPU is **very slow** and may fail on large images/PDFs due to RAM/time limits.
Prefer **Tiny/Small** mode on CPU.
"""
)
with gr.Row():
with gr.Column(scale=1):
file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
input_img = gr.Image(label="Input Image", type="pil", height=300)
mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Tiny", label="Mode")
            task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📝 Free OCR", label="Task")
prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
btn = gr.Button("Extract", variant="primary", size="lg")
with gr.Column(scale=2):
with gr.Tabs():
with gr.Tab("Text"):
text_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
with gr.Tab("Markdown Preview"):
md_out = gr.Markdown("")
with gr.Tab("Boxes"):
img_out = gr.Image(type="pil", height=500, show_label=False)
with gr.Tab("Cropped Images"):
gallery = gr.Gallery(show_label=False, columns=3, height=400)
with gr.Tab("Raw Text"):
raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
task.change(toggle_prompt, [task], [prompt])
btn.click(
run,
[input_img, file_in, mode, task, prompt],
[text_out, md_out, raw_out, img_out, gallery],
)
if __name__ == "__main__":
demo.queue(max_size=10).launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)