"""
DeepSeek-OCR (CPU-only) Space app
- No FlashAttention / no CUDA required.
- Designed to run on Hugging Face CPU spaces (VERY SLOW).
"""
import os

# Disable CUDA paths before importing torch
os.environ["CUDA_VISIBLE_DEVICES"] = ""
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

import ast
import base64
import contextlib
import re
import shutil
import tempfile
from io import StringIO, BytesIO

import numpy as np  # IMPORTANT: must be before torch in some environments
import torch
import gradio as gr
from transformers import AutoModel, AutoTokenizer
from PIL import Image, ImageDraw, ImageFont, ImageOps
import fitz  # PyMuPDF

MODEL_NAME = "deepseek-ai/DeepSeek-OCR"

# Keep CPU threads reasonable (optional). Deliberately best-effort: a failure
# here must never prevent the app from starting.
try:
    torch.set_num_threads(max(1, min(8, os.cpu_count() or 1)))
except Exception:
    pass

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval()  # stays on CPU

# Resolution presets forwarded verbatim to ``model.infer``.
MODEL_CONFIGS = {
    "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
    "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
    "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
    "Gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
    "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
}

# NOTE(review): upstream DeepSeek-OCR examples appear to start each prompt with
# an "<image>" placeholder before the newline. The prompts below (and the ones
# built in ``process_image``) start with a bare "\n" - confirm the placeholder
# was not lost somewhere before relying on these strings.
TASK_PROMPTS = {
    "📋 Markdown": {"prompt": "\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
    "📝 Free OCR": {"prompt": "\nFree OCR.", "has_grounding": False},
    "📍 Locate": {"prompt": "\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
    "🔍 Describe": {"prompt": "\nDescribe this image in detail.", "has_grounding": False},
    "✏️ Custom": {"prompt": "", "has_grounding": False},
}

# One grounding reference: <|ref|>LABEL<|/ref|><|det|>COORDS<|/det|>.
# Groups: (whole match, label, coordinates text).
_GROUNDING_RE = re.compile(
    r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)', re.DOTALL
)

# Substrings that mark progress/debug lines in the captured model stdout.
_NOISE_MARKERS = (
    "image:",
    "other:",
    "PATCHES",
    "====",
    "BASE:",
    "%|",
    "torch.Size",
)


def extract_grounding_references(text: str):
    """Return every grounding reference found in *text*.

    Each item is a ``(full_match, label, coords_text)`` tuple, in order of
    appearance.
    """
    return _GROUNDING_RE.findall(text)


def draw_bounding_boxes(image: Image.Image, refs, extract_images: bool = False):
    """Draw labelled boxes for *refs* on a copy of *image*.

    Args:
        image: Source image; it is not modified.
        refs: Tuples as produced by ``extract_grounding_references``.
        extract_images: When true, regions labelled ``"image"`` are also
            cropped out of the original image.

    Returns:
        ``(annotated_image, crops)`` where ``crops`` is a list of cropped
        images (empty unless *extract_images* is true).
    """
    img_w, img_h = image.size
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    # Translucent fills go on a separate RGBA layer pasted on top at the end.
    overlay = Image.new("RGBA", img_draw.size, (0, 0, 0, 0))
    overlay_draw = ImageDraw.Draw(overlay)

    font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
    try:
        font = ImageFont.truetype(font_path, 30)
    except Exception:
        # Font file missing or unusable: fall back to Pillow's built-in font.
        font = ImageFont.load_default()

    crops = []
    color_map = {}
    # Local, fixed-seed generator: same colour sequence on every call without
    # reseeding NumPy's global random state as a side effect.
    rng = np.random.RandomState(42)

    for ref in refs:
        label = ref[1]
        if label not in color_map:
            color_map[label] = (
                int(rng.randint(50, 255)),
                int(rng.randint(50, 255)),
                int(rng.randint(50, 255)),
            )
        color = color_map[label]

        # SECURITY: the coordinate text is model output, which is derived from
        # user-supplied images. It used to be passed to eval(); parse it as a
        # plain Python literal instead so it can never execute code.
        try:
            coords = ast.literal_eval(ref[2].strip())
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
        if not isinstance(coords, (list, tuple)):
            continue

        color_a = color + (60,)
        for box in coords:
            # Coordinates are scaled from a 0-999 range to pixel positions.
            try:
                x1, y1, x2, y2 = (
                    int(value / 999 * extent)
                    for value, extent in zip(box[:4], (img_w, img_h, img_w, img_h))
                )
            except (TypeError, ValueError):
                # Malformed box (wrong length or non-numeric): skip it.
                continue
            # Guard against inverted boxes, which Pillow may reject.
            x1, x2 = sorted((x1, x2))
            y1, y2 = sorted((y1, y2))

            if extract_images and label == "image":
                crops.append(image.crop((x1, y1, x2, y2)))

            width = 5 if label == "title" else 3
            draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
            overlay_draw.rectangle([x1, y1, x2, y2], fill=color_a)

            try:
                text_bbox = draw.textbbox((0, 0), label, font=font)
                tw, th = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
            except Exception:
                # Text measurement unavailable: use a rough estimate.
                tw, th = (len(label) * 10, 20)

            ty = max(0, y1 - 20)
            draw.rectangle([x1, ty, x1 + tw + 4, ty + th + 4], fill=color)
            draw.text((x1 + 2, ty + 2), label, font=font, fill=(255, 255, 255))

    img_draw.paste(overlay, (0, 0), overlay)
    return img_draw, crops


def clean_output(text: str, include_images: bool = False) -> str:
    """Strip grounding markup from the model output.

    References labelled ``image`` are replaced by ``**[Figure N]**``
    placeholders when *include_images* is true and removed otherwise. For any
    other reference, every line containing that reference is removed.

    Returns the stripped text, or ``""`` for empty input.
    """
    if not text:
        return ""

    img_num = 0
    for match in _GROUNDING_RE.findall(text):
        full_match = match[0]
        if "<|ref|>image<|/ref|>" in full_match:
            if include_images:
                text = text.replace(full_match, f"\n\n**[Figure {img_num + 1}]**\n\n", 1)
                img_num += 1
            else:
                text = text.replace(full_match, "", 1)
        else:
            text = re.sub(rf"(?m)^[^\n]*{re.escape(full_match)}[^\n]*\n?", "", text)

    return text.strip()


def embed_images(markdown: str, crops):
    """Replace ``**[Figure N]**`` placeholders with inline base64 PNG images.

    The i-th crop (0-based) replaces the first occurrence of
    ``**[Figure i+1]**``. Returns *markdown* unchanged when *crops* is empty.
    """
    if not crops:
        return markdown

    for index, crop in enumerate(crops, start=1):
        buf = BytesIO()
        crop.save(buf, format="PNG")
        b64 = base64.b64encode(buf.getvalue()).decode()
        markdown = markdown.replace(
            f"**[Figure {index}]**",
            f"\n\n![Figure {index}](data:image/png;base64,{b64})\n\n",
            1,
        )
    return markdown


def infer_with_model(prompt: str, jpg_path: str, out_dir: str, base_size: int, image_size: int, crop_mode: bool) -> str:
    """Run ``model.infer`` and return everything it printed to stdout.

    This app reads the recognised text from the model's stdout, so stdout is
    redirected into a buffer for the duration of the call and always restored
    afterwards, even if inference raises.
    """
    captured = StringIO()
    with contextlib.redirect_stdout(captured):
        model.infer(
            tokenizer=tokenizer,
            prompt=prompt,
            image_file=jpg_path,
            output_path=out_dir,
            base_size=base_size,
            image_size=image_size,
            crop_mode=crop_mode,
        )
    return captured.getvalue()


def _filter_model_stdout(raw_stdout: str) -> str:
    """Drop progress/debug lines from captured model output."""
    kept = [
        line
        for line in raw_stdout.split("\n")
        if not any(marker in line for marker in _NOISE_MARKERS)
    ]
    return "\n".join(kept).strip()


def process_image(image: Image.Image, mode: str, task: str, custom_prompt: str):
    """Run OCR on a single image.

    Args:
        image: Input image, or ``None``.
        mode: Key into ``MODEL_CONFIGS``.
        task: Key into ``TASK_PROMPTS``.
        custom_prompt: Free-form prompt (Custom task) or text to find (Locate
            task); ignored for the other tasks.

    Returns:
        ``(cleaned_text, markdown, raw_text, boxes_image, crops)``. On failure
        the first element carries an error message and the rest are empty.
    """
    if image is None:
        return "Error: Upload image", "", "", None, []

    # The textbox value may be missing; treat that like an empty prompt.
    custom_prompt = (custom_prompt or "").strip()
    if task in ("✏️ Custom", "📍 Locate") and not custom_prompt:
        return "Error: Enter prompt", "", "", None, []

    image = ImageOps.exif_transpose(image)
    # Normalise every non-RGB mode, not just RGBA/LA/P: the image is saved as
    # JPEG and annotated with RGB colours below.
    if image.mode != "RGB":
        image = image.convert("RGB")

    config = MODEL_CONFIGS[mode]

    if task == "✏️ Custom":
        prompt = f"\n{custom_prompt}"
        has_grounding = "<|grounding|>" in custom_prompt
    elif task == "📍 Locate":
        prompt = f"\nLocate <|ref|>{custom_prompt}<|/ref|> in the image."
        has_grounding = True
    else:
        prompt = TASK_PROMPTS[task]["prompt"]
        has_grounding = TASK_PROMPTS[task]["has_grounding"]

    # One scratch directory holds both the model input and its output folder,
    # so a single cleanup in ``finally`` covers every failure path.
    work_dir = tempfile.mkdtemp()
    try:
        jpg_path = os.path.join(work_dir, "input.jpg")
        out_dir = os.path.join(work_dir, "output")
        os.makedirs(out_dir)
        image.save(jpg_path, "JPEG", quality=95)

        raw_stdout = infer_with_model(
            prompt=prompt,
            jpg_path=jpg_path,
            out_dir=out_dir,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
        )

        # Filter noisy lines (progress/debug)
        result = _filter_model_stdout(raw_stdout)
        if not result:
            return "No text", "", "", None, []

        cleaned = clean_output(result, False)
        markdown = clean_output(result, True)

        img_out = None
        crops = []
        if has_grounding and "<|ref|>" in result:
            refs = extract_grounding_references(result)
            if refs:
                img_out, crops = draw_bounding_boxes(image, refs, True)
                markdown = embed_images(markdown, crops)

        return cleaned, markdown, result, img_out, crops
    except Exception as e:
        # UI boundary: report the failure in the text output instead of raising.
        return f"Runtime error: {type(e).__name__}: {e}", "", "", None, []
    finally:
        shutil.rmtree(work_dir, ignore_errors=True)


def process_pdf(path: str, mode: str, task: str, custom_prompt: str):
    """Run OCR on every page of a PDF and join the per-page results.

    Each page is rendered at 300 DPI and passed to ``process_image``. Text
    outputs are joined with a ``--- Page Break ---`` separator, crops from all
    pages are concatenated, and the returned boxes image is the one from the
    last page that produced one.
    """
    doc = fitz.open(path)
    try:
        all_cleaned, all_markdown, all_raw, all_crops = [], [], [], []
        img_out = None
        zoom = fitz.Matrix(300 / 72, 300 / 72)  # PDF units are 72 per inch

        for page_idx in range(len(doc)):
            page = doc.load_page(page_idx)
            pix = page.get_pixmap(matrix=zoom, alpha=False)
            img = Image.open(BytesIO(pix.tobytes("png")))

            cleaned, markdown, raw, page_img_out, page_crops = process_image(
                img, mode, task, custom_prompt
            )
            all_cleaned.append(cleaned)
            all_markdown.append(markdown)
            all_raw.append(raw)
            all_crops.extend(page_crops)
            if page_img_out is not None:
                img_out = page_img_out

        separator = "\n\n--- Page Break ---\n\n"
        return (
            separator.join(all_cleaned),
            separator.join(all_markdown),
            separator.join(all_raw),
            img_out,
            all_crops,
        )
    finally:
        doc.close()


def run(image, file_path, mode, task, custom_prompt):
    """Gradio click handler: an uploaded file takes priority over the image widget."""
    if file_path:
        if file_path.lower().endswith(".pdf"):
            return process_pdf(file_path, mode, task, custom_prompt)
        # Close the file handle once processing is done; process_image works
        # on its own copy of the pixel data.
        with Image.open(file_path) as uploaded:
            return process_image(uploaded, mode, task, custom_prompt)

    if image is not None:
        return process_image(image, mode, task, custom_prompt)

    return "Error: upload file or image", "", "", None, []


def toggle_prompt(task):
    """Show the prompt textbox only for tasks that need user-supplied text."""
    if task == "✏️ Custom":
        return gr.update(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for boxes")
    if task == "📍 Locate":
        return gr.update(visible=True, label="Text to Locate", placeholder="Enter text")
    return gr.update(visible=False)


with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR (CPU)") as demo:
    gr.Markdown(
        """
        # 🐢 DeepSeek-OCR (CPU)

        ⚠️ CPU is **very slow** and may fail on large images/PDFs due to RAM/time limits.
        Prefer **Tiny/Small** mode on CPU.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
            input_img = gr.Image(label="Input Image", type="pil", height=300)
            mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Tiny", label="Mode")
            task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📝 Free OCR", label="Task")
            prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
            btn = gr.Button("Extract", variant="primary", size="lg")

        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.Tab("Text"):
                    text_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
                with gr.Tab("Markdown Preview"):
                    md_out = gr.Markdown("")
                with gr.Tab("Boxes"):
                    img_out = gr.Image(type="pil", height=500, show_label=False)
                with gr.Tab("Cropped Images"):
                    gallery = gr.Gallery(show_label=False, columns=3, height=400)
                with gr.Tab("Raw Text"):
                    raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)

    task.change(toggle_prompt, [task], [prompt])
    btn.click(
        run,
        [input_img, file_in, mode, task, prompt],
        [text_out, md_out, raw_out, img_out, gallery],
    )

if __name__ == "__main__":
    demo.queue(max_size=10).launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)