UnMelow committed on
Commit 4ce4fa4 · verified · 1 Parent(s): 9609f9b

Update app.py

Files changed (1)
  1. app.py +435 -119
app.py CHANGED

@@ -1,256 +1,539 @@
- import gradio as gr
- from transformers import AutoModel, AutoTokenizer
- import torch
- import spaces
  import os
  import sys
- import tempfile
- import shutil
- from PIL import Image, ImageDraw, ImageFont, ImageOps
- import fitz
  import re
  import warnings
- import numpy as np
  import base64
  from io import StringIO, BytesIO

- MODEL_NAME = 'deepseek-ai/DeepSeek-OCR'

- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
- model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
- model = model.eval().cuda()

  MODEL_CONFIGS = {
      "Gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
      "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
      "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
      "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
-     "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False}
  }

  TASK_PROMPTS = {
-     "📋 Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
-     "📝 Free OCR": {"prompt": "<image>\nFree OCR.", "has_grounding": False},
-     "📍 Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
-     "🔍 Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
-     "✏️ Custom": {"prompt": "", "has_grounding": False}
  }

- def extract_grounding_references(text):
-     pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
      return re.findall(pattern, text, re.DOTALL)

- def draw_bounding_boxes(image, refs, extract_images=False):
      img_w, img_h = image.size
      img_draw = image.copy()
      draw = ImageDraw.Draw(img_draw)
-     overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
      draw2 = ImageDraw.Draw(overlay)
-     font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 30)
      crops = []
-
      color_map = {}
      np.random.seed(42)

      for ref in refs:
          label = ref[1]
          if label not in color_map:
-             color_map[label] = (np.random.randint(50, 255), np.random.randint(50, 255), np.random.randint(50, 255))

          color = color_map[label]
-         coords = eval(ref[2])
          color_a = color + (60,)
-
          for box in coords:
-             x1, y1, x2, y2 = int(box[0]/999*img_w), int(box[1]/999*img_h), int(box[2]/999*img_w), int(box[3]/999*img_h)
-
-             if extract_images and label == 'image':
                  crops.append(image.crop((x1, y1, x2, y2)))
-
-             width = 5 if label == 'title' else 3
              draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
              draw2.rectangle([x1, y1, x2, y2], fill=color_a)
-
              text_bbox = draw.textbbox((0, 0), label, font=font)
              tw, th = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
              ty = max(0, y1 - 20)
              draw.rectangle([x1, ty, x1 + tw + 4, ty + th + 4], fill=color)
              draw.text((x1 + 2, ty + 2), label, font=font, fill=(255, 255, 255))
-
      img_draw.paste(overlay, (0, 0), overlay)
      return img_draw, crops

- def clean_output(text, include_images=False):
      if not text:
          return ""
-     pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
      matches = re.findall(pattern, text, re.DOTALL)
      img_num = 0
-
      for match in matches:
-         if '<|ref|>image<|/ref|>' in match[0]:
              if include_images:
-                 text = text.replace(match[0], f'\n\n**[Figure {img_num + 1}]**\n\n', 1)
                  img_num += 1
              else:
-                 text = text.replace(match[0], '', 1)
          else:
-             text = re.sub(rf'(?m)^[^\n]*{re.escape(match[0])}[^\n]*\n?', '', text)
-
      return text.strip()

- def embed_images(markdown, crops):
      if not crops:
          return markdown
      for i, img in enumerate(crops):
          buf = BytesIO()
          img.save(buf, format="PNG")
          b64 = base64.b64encode(buf.getvalue()).decode()
-         markdown = markdown.replace(f'**[Figure {i + 1}]**', f'\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n', 1)
      return markdown

- @spaces.GPU(duration=60)
- def process_image(image, mode, task, custom_prompt):
      if image is None:
-         return " Error Upload image", "", "", None, []
      if task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
-         return "Enter prompt", "", "", None, []
-
-     if image.mode in ('RGBA', 'LA', 'P'):
-         image = image.convert('RGB')
      image = ImageOps.exif_transpose(image)
-
      config = MODEL_CONFIGS[mode]
-
      if task == "✏️ Custom":
          prompt = f"<image>\n{custom_prompt.strip()}"
-         has_grounding = '<|grounding|>' in custom_prompt
      elif task == "📍 Locate":
          prompt = f"<image>\nLocate <|ref|>{custom_prompt.strip()}<|/ref|> in the image."
          has_grounding = True
      else:
          prompt = TASK_PROMPTS[task]["prompt"]
          has_grounding = TASK_PROMPTS[task]["has_grounding"]
-
-     tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
-     image.save(tmp.name, 'JPEG', quality=95)
      tmp.close()
      out_dir = tempfile.mkdtemp()
-
      stdout = sys.stdout
      sys.stdout = StringIO()
-
-     model.infer(tokenizer=tokenizer, prompt=prompt, image_file=tmp.name, output_path=out_dir,
-                 base_size=config["base_size"], image_size=config["image_size"], crop_mode=config["crop_mode"])
-
-     result = '\n'.join([l for l in sys.stdout.getvalue().split('\n')
-                         if not any(s in l for s in ['image:', 'other:', 'PATCHES', '====', 'BASE:', '%|', 'torch.Size'])]).strip()
-     sys.stdout = stdout
-
-     os.unlink(tmp.name)
-     shutil.rmtree(out_dir, ignore_errors=True)
-
      if not result:
          return "No text", "", "", None, []
-
-     cleaned = clean_output(result, False)
-     markdown = clean_output(result, True)
-
      img_out = None
      crops = []
-
-     if has_grounding and '<|ref|>' in result:
          refs = extract_grounding_references(result)
          if refs:
-             img_out, crops = draw_bounding_boxes(image, refs, True)
-
      markdown = embed_images(markdown, crops)
-
      return cleaned, markdown, result, img_out, crops

- @spaces.GPU(duration=60)
- def process_pdf(path, mode, task, custom_prompt, page_num):
      doc = fitz.open(path)
      total_pages = len(doc)
      if page_num < 1 or page_num > total_pages:
          doc.close()
          return f"Invalid page number. PDF has {total_pages} pages.", "", "", None, []
      page = doc.load_page(page_num - 1)
-     pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
      img = Image.open(BytesIO(pix.tobytes("png")))
      doc.close()
-
      return process_image(img, mode, task, custom_prompt)

- def process_file(path, mode, task, custom_prompt, page_num):
      if not path:
-         return "Error Upload file", "", "", None, []
-     if path.lower().endswith('.pdf'):
          return process_pdf(path, mode, task, custom_prompt, page_num)
-     else:
-         return process_image(Image.open(path), mode, task, custom_prompt)

- def toggle_prompt(task):
      if task == "✏️ Custom":
          return gr.update(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for boxes")
-     elif task == "📍 Locate":
          return gr.update(visible=True, label="Text to Locate", placeholder="Enter text")
      return gr.update(visible=False)

- def select_boxes(task):
      if task == "📍 Locate":
          return gr.update(selected="tab_boxes")
      return gr.update()

- def get_pdf_page_count(file_path):
-     if not file_path or not file_path.lower().endswith('.pdf'):
          return 1
      doc = fitz.open(file_path)
      count = len(doc)
      doc.close()
      return count

- def load_image(file_path, page_num=1):
      if not file_path:
          return None
-     if file_path.lower().endswith('.pdf'):
          doc = fitz.open(file_path)
          page_idx = max(0, min(int(page_num) - 1, len(doc) - 1))
          page = doc.load_page(page_idx)
-         pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
          img = Image.open(BytesIO(pix.tobytes("png")))
          doc.close()
          return img
-     else:
-         return Image.open(file_path)

- def update_page_selector(file_path):
      if not file_path:
          return gr.update(visible=False)
-     if file_path.lower().endswith('.pdf'):
          page_count = get_pdf_page_count(file_path)
-         return gr.update(visible=True, maximum=page_count, value=1, minimum=1,
-                          label=f"Select Page (1-{page_count})")
      return gr.update(visible=False)

- with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR") as demo:
-     gr.Markdown("""
-     # 🚀 DeepSeek-OCR Demo
-     **Convert documents to markdown, extract raw text, and locate specific content with bounding boxes. It takes 20~ sec for markdown and 3~ sec for locate task examples. Check the info at the bottom of the page for more information.**
-
-     **Hope this tool was helpful! If so, a quick like ❤️ would mean a lot :)**
-     """)
-
      with gr.Row():
          with gr.Column(scale=1):
              file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
              input_img = gr.Image(label="Input Image", type="pil", height=300)
              page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
              mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Gundam", label="Mode")
              task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
              prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
              btn = gr.Button("Extract", variant="primary", size="lg")
-
          with gr.Column(scale=2):
              with gr.Tabs() as tabs:
                  with gr.Tab("Text", id="tab_text"):

  import os
  import sys
  import re
+ import shutil
+ import tempfile
  import warnings
  import base64
  from io import StringIO, BytesIO
+ from typing import List, Tuple
+
+ import gradio as gr
+ import torch
+ import numpy as np
+ from PIL import Image, ImageDraw, ImageFont, ImageOps
+ import fitz  # PyMuPDF
+
+ from transformers import (
+     AutoModel,
+     AutoTokenizer,
+     AutoProcessor,
+     VisionEncoderDecoderModel,
+     BlipProcessor,
+     BlipForConditionalGeneration,
+ )
+
+ # --- Optional HF Spaces GPU decorator (safe fallback for local runs) ---
+ try:
+     import spaces  # type: ignore
+
+     gpu_decorator = spaces.GPU
+ except Exception:
+     def gpu_decorator(*args, **kwargs):
+         def wrap(fn):
+             return fn
+         return wrap
+
+
+ # =========================
+ # Device / dtype utilities
+ # =========================
+ def get_device() -> str:
+     return "cuda" if torch.cuda.is_available() else "cpu"
+
+
+ def get_cuda_dtype() -> torch.dtype:
+     # bf16 only on supported GPUs (Ampere+). Otherwise fp16.
+     try:
+         if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
+             return torch.bfloat16
+     except Exception:
+         pass
+     return torch.float16
+
+
+ DEVICE = get_device()
+ CUDA_DTYPE = get_cuda_dtype() if DEVICE == "cuda" else torch.float32
+
+
+ # =========================
+ # Model names
+ # =========================
+ DEEPSEEK_OCR_NAME = os.getenv("DEEPSEEK_OCR_MODEL", "deepseek-ai/DeepSeek-OCR")
+ # Optional pin to a specific revision/commit to avoid auto-updating remote code.
+ DEEPSEEK_OCR_REVISION = os.getenv("DEEPSEEK_OCR_REVISION", None)
+
+ TROCR_NAME = os.getenv("TROCR_MODEL", "microsoft/trocr-base-printed")
+ BLIP_NAME = os.getenv("BLIP_MODEL", "Salesforce/blip-image-captioning-base")


+ # =========================
+ # Load DeepSeek-OCR safely
+ # =========================
+ def load_deepseek_ocr():
+     tokenizer = AutoTokenizer.from_pretrained(
+         DEEPSEEK_OCR_NAME,
+         trust_remote_code=True,
+         revision=DEEPSEEK_OCR_REVISION,
+     )

+     base_kwargs = dict(
+         trust_remote_code=True,
+         use_safetensors=True,
+         revision=DEEPSEEK_OCR_REVISION,
+     )

+     # IMPORTANT:
+     # - Do NOT force flash_attention_2 on CPU.
+     # - On CUDA: try flash_attention_2, but gracefully fall back if unavailable.
+     if DEVICE == "cuda":
+         # Try FlashAttention2 first
+         try:
+             model = AutoModel.from_pretrained(
+                 DEEPSEEK_OCR_NAME,
+                 torch_dtype=CUDA_DTYPE,
+                 _attn_implementation="flash_attention_2",
+                 **base_kwargs,
+             )
+         except Exception as e:
+             warnings.warn(
+                 f"FlashAttention2 unavailable or failed ({e}). Falling back to SDPA/eager."
+             )
+             # Try SDPA
+             try:
+                 model = AutoModel.from_pretrained(
+                     DEEPSEEK_OCR_NAME,
+                     torch_dtype=CUDA_DTYPE,
+                     _attn_implementation="sdpa",
+                     **base_kwargs,
+                 )
+             except Exception:
+                 # Final fallback
+                 model = AutoModel.from_pretrained(
+                     DEEPSEEK_OCR_NAME,
+                     torch_dtype=CUDA_DTYPE,
+                     _attn_implementation="eager",
+                     **base_kwargs,
+                 )
+
+         model = model.eval().to(DEVICE)
+
+     else:
+         # CPU path: no flash attention, use float32 for stability
+         model = AutoModel.from_pretrained(
+             DEEPSEEK_OCR_NAME,
+             torch_dtype=torch.float32,
+             _attn_implementation="eager",
+             **base_kwargs,
+         )
+         model = model.eval().to(DEVICE)
+
+     return tokenizer, model
+
+
+ tokenizer, deepseek_model = load_deepseek_ocr()
+
+
+ # =========================
+ # Load TrOCR and BLIP
+ # =========================
+ def load_trocr():
+     processor = AutoProcessor.from_pretrained(TROCR_NAME)
+     model = VisionEncoderDecoderModel.from_pretrained(TROCR_NAME).eval()
+     if DEVICE == "cuda":
+         model = model.to(DEVICE).to(dtype=CUDA_DTYPE)
+     else:
+         model = model.to(DEVICE)
+     return processor, model
+
+
+ def load_blip():
+     processor = BlipProcessor.from_pretrained(BLIP_NAME)
+     model = BlipForConditionalGeneration.from_pretrained(BLIP_NAME).eval()
+     if DEVICE == "cuda":
+         model = model.to(DEVICE).to(dtype=CUDA_DTYPE)
+     else:
+         model = model.to(DEVICE)
+     return processor, model
+
+
+ trocr_processor, trocr_model = load_trocr()
+ blip_processor, blip_model = load_blip()
+
+
+ # =========================
+ # App configs
+ # =========================
  MODEL_CONFIGS = {
      "Gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
      "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
      "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
      "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
+     "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
  }

  TASK_PROMPTS = {
+     "📋 Markdown": {
+         "prompt": "<image>\n<|grounding|>Convert the document to markdown.",
+         "has_grounding": True,
+     },
+     # NOTE: Free OCR now goes through TrOCR (fast, text-only)
+     "📝 Free OCR": {"prompt": "", "has_grounding": False},
+     # Locate stays on DeepSeek-OCR (grounding)
+     "📍 Locate": {
+         "prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.",
+         "has_grounding": True,
+     },
+     # Describe now goes through BLIP
+     "🔍 Describe": {"prompt": "", "has_grounding": False},
+     "✏️ Custom": {"prompt": "", "has_grounding": False},
  }

+
+ # =========================
+ # Helpers
+ # =========================
+ def safe_load_font(size: int = 30) -> ImageFont.FreeTypeFont:
+     candidates = [
+         "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
+         "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
+     ]
+     for p in candidates:
+         try:
+             if os.path.exists(p):
+                 return ImageFont.truetype(p, size)
+         except Exception:
+             continue
+     return ImageFont.load_default()
+
+
+ def extract_grounding_references(text: str):
+     pattern = r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)"
      return re.findall(pattern, text, re.DOTALL)

+
+ def draw_bounding_boxes(image: Image.Image, refs, extract_images: bool = False):
      img_w, img_h = image.size
      img_draw = image.copy()
      draw = ImageDraw.Draw(img_draw)
+     overlay = Image.new("RGBA", img_draw.size, (0, 0, 0, 0))
      draw2 = ImageDraw.Draw(overlay)
+     font = safe_load_font(30)
      crops = []
+
      color_map = {}
      np.random.seed(42)

      for ref in refs:
          label = ref[1]
          if label not in color_map:
+             color_map[label] = (
+                 int(np.random.randint(50, 255)),
+                 int(np.random.randint(50, 255)),
+                 int(np.random.randint(50, 255)),
+             )

          color = color_map[label]
+         try:
+             coords = eval(ref[2])
+         except Exception:
+             continue
+
          color_a = color + (60,)
+
          for box in coords:
+             x1, y1, x2, y2 = (
+                 int(box[0] / 999 * img_w),
+                 int(box[1] / 999 * img_h),
+                 int(box[2] / 999 * img_w),
+                 int(box[3] / 999 * img_h),
+             )
+
+             if extract_images and label == "image":
                  crops.append(image.crop((x1, y1, x2, y2)))
+
+             width = 5 if label == "title" else 3
              draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
              draw2.rectangle([x1, y1, x2, y2], fill=color_a)
+
              text_bbox = draw.textbbox((0, 0), label, font=font)
              tw, th = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
              ty = max(0, y1 - 20)
              draw.rectangle([x1, ty, x1 + tw + 4, ty + th + 4], fill=color)
              draw.text((x1 + 2, ty + 2), label, font=font, fill=(255, 255, 255))
+
      img_draw.paste(overlay, (0, 0), overlay)
      return img_draw, crops

+
+ def clean_output(text: str, include_images: bool = False) -> str:
      if not text:
          return ""
+     pattern = r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)"
      matches = re.findall(pattern, text, re.DOTALL)
      img_num = 0
+
      for match in matches:
+         if "<|ref|>image<|/ref|>" in match[0]:
              if include_images:
+                 text = text.replace(match[0], f"\n\n**[Figure {img_num + 1}]**\n\n", 1)
                  img_num += 1
              else:
+                 text = text.replace(match[0], "", 1)
          else:
+             text = re.sub(rf"(?m)^[^\n]*{re.escape(match[0])}[^\n]*\n?", "", text)
+
      return text.strip()

+
+ def embed_images(markdown: str, crops: List[Image.Image]) -> str:
      if not crops:
          return markdown
      for i, img in enumerate(crops):
          buf = BytesIO()
          img.save(buf, format="PNG")
          b64 = base64.b64encode(buf.getvalue()).decode()
+         markdown = markdown.replace(
+             f"**[Figure {i + 1}]**",
+             f"\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n",
+             1,
+         )
      return markdown

+ def trocr_ocr(image: Image.Image) -> str:
+     if image.mode != "RGB":
+         image = image.convert("RGB")
+     pixel_values = trocr_processor(images=image, return_tensors="pt").pixel_values
+     # Match the model's device and dtype (fp16/bf16 on CUDA); the processor returns fp32.
+     pixel_values = pixel_values.to(DEVICE, dtype=trocr_model.dtype)
+     with torch.no_grad():
+         # Keep generation modest (faster)
+         generated_ids = trocr_model.generate(pixel_values, max_new_tokens=256)
+     text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+     return text.strip()
+
+
+ def blip_describe(image: Image.Image) -> str:
+     if image.mode != "RGB":
+         image = image.convert("RGB")
+     inputs = blip_processor(images=image, return_tensors="pt").to(DEVICE)
+     # Cast floating-point inputs to the model's dtype (fp16/bf16 on CUDA).
+     inputs = {k: v.to(blip_model.dtype) if torch.is_floating_point(v) else v
+               for k, v in inputs.items()}
+     with torch.no_grad():
+         out = blip_model.generate(**inputs, max_new_tokens=80)
+     caption = blip_processor.decode(out[0], skip_special_tokens=True)
+     return caption.strip()
+
+
+ # =========================
+ # Core processing
+ # =========================
+ @gpu_decorator(duration=60)
+ def process_image(image: Image.Image, mode: str, task: str, custom_prompt: str):
      if image is None:
+         return "Error: upload image", "", "", None, []
+
      if task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
+         return "Error: enter prompt", "", "", None, []
+
+     if image.mode in ("RGBA", "LA", "P"):
+         image = image.convert("RGB")
      image = ImageOps.exif_transpose(image)
+
+     # --- Route tasks to the best backend ---
+     if task == "📝 Free OCR":
+         text = trocr_ocr(image)
+         if not text:
+             return "No text", "", "", None, []
+         md = "```text\n" + text + "\n```"
+         return text, md, text, None, []
+
+     if task == "🔍 Describe":
+         desc = blip_describe(image)
+         if not desc:
+             return "No description", "", "", None, []
+         md = f"**Description:** {desc}"
+         return desc, md, desc, None, []
+
+     # --- DeepSeek-OCR for Markdown / Locate / Custom ---
      config = MODEL_CONFIGS[mode]
+
      if task == "✏️ Custom":
          prompt = f"<image>\n{custom_prompt.strip()}"
+         has_grounding = "<|grounding|>" in custom_prompt
      elif task == "📍 Locate":
          prompt = f"<image>\nLocate <|ref|>{custom_prompt.strip()}<|/ref|> in the image."
          has_grounding = True
      else:
          prompt = TASK_PROMPTS[task]["prompt"]
          has_grounding = TASK_PROMPTS[task]["has_grounding"]
+
+     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg")
+     image.save(tmp.name, "JPEG", quality=95)
      tmp.close()
      out_dir = tempfile.mkdtemp()
+
      stdout = sys.stdout
      sys.stdout = StringIO()
+
+     try:
+         deepseek_model.infer(
+             tokenizer=tokenizer,
+             prompt=prompt,
+             image_file=tmp.name,
+             output_path=out_dir,
+             base_size=config["base_size"],
+             image_size=config["image_size"],
+             crop_mode=config["crop_mode"],
+         )
+
+         result = "\n".join(
+             [
+                 l
+                 for l in sys.stdout.getvalue().split("\n")
+                 if not any(
+                     s in l
+                     for s in [
+                         "image:",
+                         "other:",
+                         "PATCHES",
+                         "====",
+                         "BASE:",
+                         "%|",
+                         "torch.Size",
+                     ]
+                 )
+             ]
+         ).strip()
+
+     finally:
+         sys.stdout = stdout
+         try:
+             os.unlink(tmp.name)
+         except Exception:
+             pass
+         shutil.rmtree(out_dir, ignore_errors=True)
+
      if not result:
          return "No text", "", "", None, []
+
+     cleaned = clean_output(result, include_images=False)
+     markdown = clean_output(result, include_images=True)
+
      img_out = None
      crops = []
+
+     if has_grounding and "<|ref|>" in result:
          refs = extract_grounding_references(result)
          if refs:
+             img_out, crops = draw_bounding_boxes(image, refs, extract_images=True)
+
      markdown = embed_images(markdown, crops)
+
      return cleaned, markdown, result, img_out, crops

+
+ @gpu_decorator(duration=60)
+ def process_pdf(path: str, mode: str, task: str, custom_prompt: str, page_num: int):
      doc = fitz.open(path)
      total_pages = len(doc)
      if page_num < 1 or page_num > total_pages:
          doc.close()
          return f"Invalid page number. PDF has {total_pages} pages.", "", "", None, []
      page = doc.load_page(page_num - 1)
+     pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
      img = Image.open(BytesIO(pix.tobytes("png")))
      doc.close()
      return process_image(img, mode, task, custom_prompt)

+
+ def process_file(path: str, mode: str, task: str, custom_prompt: str, page_num: int):
      if not path:
+         return "Error: upload file", "", "", None, []
+     if path.lower().endswith(".pdf"):
          return process_pdf(path, mode, task, custom_prompt, page_num)
+     return process_image(Image.open(path), mode, task, custom_prompt)
+

+ def toggle_prompt(task: str):
      if task == "✏️ Custom":
          return gr.update(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for boxes")
+     if task == "📍 Locate":
          return gr.update(visible=True, label="Text to Locate", placeholder="Enter text")
      return gr.update(visible=False)

+
+ def select_boxes(task: str):
      if task == "📍 Locate":
          return gr.update(selected="tab_boxes")
      return gr.update()

+
+ def get_pdf_page_count(file_path: str) -> int:
+     if not file_path or not file_path.lower().endswith(".pdf"):
          return 1
      doc = fitz.open(file_path)
      count = len(doc)
      doc.close()
      return count

+
+ def load_image(file_path: str, page_num: int = 1):
      if not file_path:
          return None
+     if file_path.lower().endswith(".pdf"):
          doc = fitz.open(file_path)
          page_idx = max(0, min(int(page_num) - 1, len(doc) - 1))
          page = doc.load_page(page_idx)
+         pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
          img = Image.open(BytesIO(pix.tobytes("png")))
          doc.close()
          return img
+     return Image.open(file_path)
+

+ def update_page_selector(file_path: str):
      if not file_path:
          return gr.update(visible=False)
+     if file_path.lower().endswith(".pdf"):
          page_count = get_pdf_page_count(file_path)
+         return gr.update(
+             visible=True,
+             maximum=page_count,
+             value=1,
+             minimum=1,
+             label=f"Select Page (1-{page_count})",
+         )
      return gr.update(visible=False)

+
+ # =========================
+ # UI
+ # =========================
+ with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR + TrOCR + BLIP") as demo:
+     gr.Markdown(
+         f"""
+ # DeepSeek-OCR Demo (with TrOCR + BLIP)
+
+ This app supports:
+ - **Markdown**: DeepSeek-OCR (structured markdown + optional grounding boxes)
+ - **Free OCR**: TrOCR (fast text-only OCR)
+ - **Locate**: DeepSeek-OCR (grounding boxes)
+ - **Describe**: BLIP (image captioning)
+
+ Runtime device: **{DEVICE}**
+ """
+     )
+
      with gr.Row():
          with gr.Column(scale=1):
              file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
              input_img = gr.Image(label="Input Image", type="pil", height=300)
              page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
+
              mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Gundam", label="Mode")
              task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
              prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
+
              btn = gr.Button("Extract", variant="primary", size="lg")
+
          with gr.Column(scale=2):
              with gr.Tabs() as tabs:
                  with gr.Tab("Text", id="tab_text"):

@@ -263,25 +546,58 @@ with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR") as demo:
                  gallery = gr.Gallery(show_label=False, columns=3, height=400)
              with gr.Tab("Raw Text", id="tab_raw"):
                  raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
-
-
-
      file_in.change(load_image, [file_in, page_selector], [input_img])
      file_in.change(update_page_selector, [file_in], [page_selector])
      page_selector.change(load_image, [file_in, page_selector], [input_img])
      task.change(toggle_prompt, [task], [prompt])
      task.change(select_boxes, [task], [tabs])
-
      def run(image, file_path, mode, task, custom_prompt, page_num):
          if file_path:
              return process_file(file_path, mode, task, custom_prompt, int(page_num))
          if image is not None:
              return process_image(image, mode, task, custom_prompt)
-         return "Error uploading file or image", "", "", None, []

-     submit_event = btn.click(run, [input_img, file_in, mode, task, prompt, page_selector],
-                              [text_out, md_out, raw_out, img_out, gallery])
      submit_event.then(select_boxes, [task], [tabs])

  if __name__ == "__main__":
-     demo.queue(max_size=20).launch()

                  gallery = gr.Gallery(show_label=False, columns=3, height=400)
              with gr.Tab("Raw Text", id="tab_raw"):
                  raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
+
+     # Better examples: populate File input (works for both image/pdf paths inside repo)
+     gr.Examples(
+         examples=[
+             ["examples/ocr.jpg", "Gundam", "📋 Markdown", "", 1],
+             ["examples/reachy-mini.jpg", "Gundam", "📍 Locate", "Robot", 1],
+         ],
+         inputs=[file_in, mode, task, prompt, page_selector],
+         cache_examples=False,
+     )
+
+     with gr.Accordion("ℹ️ Info", open=False):
+         gr.Markdown(
+             """
+ ### Modes
+ - **Gundam**: 1024 base + 640 tiles with cropping - Best balance
+ - **Tiny**: 512×512, no crop - Fastest
+ - **Small**: 640×640, no crop - Quick
+ - **Base**: 1024×1024, no crop - Standard
+ - **Large**: 1280×1280, no crop - Highest quality
+
+ ### Tasks
+ - **📋 Markdown**: DeepSeek-OCR → structured markdown (grounding ✅)
+ - **📝 Free OCR**: TrOCR → fast text-only OCR
+ - **📍 Locate**: DeepSeek-OCR → bounding boxes (grounding ✅)
+ - **🔍 Describe**: BLIP → short image description
+ - **✏️ Custom**: DeepSeek-OCR prompt (add `<|grounding|>` for boxes)
+ """
+         )
+
+     # File / PDF page handling
      file_in.change(load_image, [file_in, page_selector], [input_img])
      file_in.change(update_page_selector, [file_in], [page_selector])
      page_selector.change(load_image, [file_in, page_selector], [input_img])
+
+     # Prompt visibility and tab switch
      task.change(toggle_prompt, [task], [prompt])
      task.change(select_boxes, [task], [tabs])
+
      def run(image, file_path, mode, task, custom_prompt, page_num):
          if file_path:
              return process_file(file_path, mode, task, custom_prompt, int(page_num))
          if image is not None:
              return process_image(image, mode, task, custom_prompt)
+         return "Error: upload file or image", "", "", None, []

+     submit_event = btn.click(
+         run,
+         [input_img, file_in, mode, task, prompt, page_selector],
+         [text_out, md_out, raw_out, img_out, gallery],
+     )
      submit_event.then(select_boxes, [task], [tabs])

  if __name__ == "__main__":
+     demo.queue(max_size=20).launch()
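
The heart of this commit is the defensive model loading: instead of unconditionally requesting `flash_attention_2` and calling `.cuda()`, `load_deepseek_ocr` now steps down a ladder of attention implementations and keeps the first one that loads. A minimal standalone sketch of that pattern, assuming only `transformers` and `torch` (the helper name `load_with_attn_fallback` and its `model_id` argument are illustrative, not part of the commit):

```python
# Sketch: try FlashAttention2, then SDPA, then eager, keeping the first
# implementation that loads; mirrors the fallback in load_deepseek_ocr.
import warnings

import torch
from transformers import AutoModel


def load_with_attn_fallback(model_id: str, dtype: torch.dtype = torch.float16):
    last_err = None
    for impl in ("flash_attention_2", "sdpa", "eager"):
        try:
            return AutoModel.from_pretrained(
                model_id,
                torch_dtype=dtype,
                _attn_implementation=impl,
                trust_remote_code=True,
                use_safetensors=True,
            )
        except Exception as err:  # e.g. flash-attn is not installed on this box
            warnings.warn(f"{impl} failed ({err}); trying the next implementation.")
            last_err = err
    raise RuntimeError("No attention implementation could be loaded") from last_err
```

The same laddering, together with the `spaces.GPU` try/except shim at the top of the file, is what lets the Space start on CPU-only hardware, where the old unconditional `flash_attention_2` + `.cuda()` pair failed at startup.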