File size: 6,023 Bytes
498832d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import os
import io
import tempfile
import shutil
from pathlib import Path
import gradio as gr
from PIL import Image
import fitz  # PyMuPDF

# --- dots.ocr imports ---
# We install dots.ocr in requirements.txt and download weights at startup.
# The repo recommends saving weights under a path WITHOUT dots in the directory name.
# Ref: README notes about "DotsOCR" dir. (See citations)
from dots_ocr import pipeline as dots_pipeline  # if module layout differs, adjust to: from dots_ocr.pipeline import DotsOCR

# Directory holding the model weights. The upstream README recommends a
# weights path WITHOUT a dot in the directory name (hence "DotsOCR", not
# "dots.ocr"); DOTS_WEIGHTS_DIR lets a deployment override the parent dir.
WEIGHTS_DIR = Path(os.getenv("DOTS_WEIGHTS_DIR", "weights")) / "DotsOCR"
WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)

# Module-level cache: populated on the first load_model() call so the
# (expensive) model load happens at most once per Space session.
DOTS = None

def load_model():
    """Return the shared dots.ocr pipeline, initialising it on first use.

    The loaded model is cached in the module-level ``DOTS`` variable so the
    expensive weight loading runs at most once per process.
    """
    global DOTS
    if DOTS is not None:
        return DOTS
    # NOTE(review): upstream may rename this loader between releases; if the
    # module layout differs, switch to the documented API/helper. Prompt
    # presets such as "prompt_ocr" / "prompt_grounding_ocr" are passed later,
    # at inference time.
    DOTS = dots_pipeline.load_model(model_dir=str(WEIGHTS_DIR))
    return DOTS

def pdf_to_images(pdf_bytes):
    """Rasterise every page of a PDF (given as raw bytes) into PIL images.

    Pages are rendered at 180 dpi with no alpha channel, which keeps the
    output sharp enough for OCR while bounding memory use.
    """
    pages = []
    with fitz.open(stream=pdf_bytes, filetype="pdf") as document:
        for page in document:
            pixmap = page.get_pixmap(alpha=False, dpi=180)
            pages.append(
                Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
            )
    return pages

# --- Devanagari normalization helpers ---
import re  # NOTE(review): mid-file import; conventionally belongs at the top of the file
# Invisible joiner code points that OCR output often sprinkles between letters.
ZWJ = "\u200D"   # U+200D ZERO WIDTH JOINER
ZWNJ = "\u200C"  # U+200C ZERO WIDTH NON-JOINER
NUKTA = "\u093C"  # U+093C combining nukta sign; currently unused below

def normalize_devanagari(text: str) -> str:
    """Light-weight cleanup: normalize nukta sequences, strip stray ZWJ/ZWNJ, and fix hyphen linebreaks."""
    if not text:
        return text
    # Remove stray joiners between letters (keep when used in known conjunct patterns if needed)
    text = text.replace(ZWJ, "").replace(ZWNJ, "")
    # Fix hyphenated line breaks: "मा-\n झी" -> "माझी"
    text = re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", text, flags=re.UNICODE)
    # Collapse multiple spaces/newlines
    text = re.sub(r"[ \t]+", " ", text, flags=re.UNICODE)
    text = re.sub(r"\n{3,}", "\n\n", text, flags=re.UNICODE)
    return text.strip()

def run_ocr(files, task, lang_hint, return_markdown, show_boxes):
    """Run dots.ocr over the uploaded files and aggregate the results.

    Parameters
    ----------
    files : list | None
        Uploads from the Gradio ``File`` component. With ``type="filepath"``
        these are plain path strings; file-like objects (with ``.name``) are
        tolerated for older Gradio versions. ``None`` is treated as empty.
    task : str
        Currently always "OCR"; kept for interface stability with the UI.
    lang_hint : str | None
        Language/script hint from the dropdown; "Auto" (or None) disables it.
    return_markdown : bool
        If True, request layout-aware Markdown and also write ``output.md``.
    show_boxes : bool
        If True, use the grounding prompt and collect overlay images.

    Returns
    -------
    tuple
        (combined text, markdown path or None, plain-text path, overlay list).
    """
    model = load_model()

    # --- Gather input pages as PIL images --------------------------------
    images = []
    for f in files or []:
        # BUG FIX: the original called f.read()/f.name, but with
        # gr.File(type="filepath") each entry is a path string, so that
        # crashed at runtime. Accept both forms and read PDFs from disk.
        path = f if isinstance(f, (str, Path)) else f.name
        if str(path).lower().endswith(".pdf"):
            with open(path, "rb") as fh:
                images.extend(pdf_to_images(fh.read()))
        else:
            images.append(Image.open(path).convert("RGB"))

    # --- Choose the prompt preset ----------------------------------------
    # Upstream switches tasks via prompt names such as "prompt_ocr",
    # "prompt_layout_only_en" and "prompt_grounding_ocr".
    prompt = "prompt_grounding_ocr" if show_boxes else "prompt_ocr"

    # Fold the language/script hint into the prompt text (upstream exposes no
    # dedicated language argument). BUG FIX: the original appended the literal
    # "(python)" instead of the selected hint.
    hint = lang_hint or "Auto"
    if hint != "Auto":
        prompt = f"{prompt} ({hint})"

    # Devanagari covers Hindi/Marathi/Sanskrit; guard against a None hint,
    # which previously crashed on lang_hint.startswith(...).
    is_devanagari = (
        hint.startswith("Devanagari")
        or "Hindi" in hint
        or "Marathi" in hint
        or "Sanskrit" in hint
    )

    # --- Inference, one image at a time ----------------------------------
    results = []
    for img in images:
        out = model.infer(
            img, prompt=prompt, as_markdown=return_markdown, with_layout=return_markdown
        )
        # Upstream may return 'markdown', 'text' and/or 'boxes'. BUG FIX: the
        # original expression's precedence made text_md None when markdown was
        # requested but missing; prefer the requested format, then fall back.
        if return_markdown:
            text_md = out.get("markdown") or out.get("text", "")
        else:
            text_md = out.get("text") or out.get("markdown", "")
        if is_devanagari:
            text_md = normalize_devanagari(text_md)

        results.append({
            "text": text_md,
            "boxes_image": out.get("overlay_image") if show_boxes else None,
        })

    # --- Aggregate and write downloadable artifacts ----------------------
    combined_text = "\n\n".join(r["text"] for r in results if r["text"])

    md_path = None
    if return_markdown:
        md_path = "output.md"
        with open(md_path, "w", encoding="utf-8") as fh:
            fh.write(combined_text)

    txt_path = "output.txt"
    with open(txt_path, "w", encoding="utf-8") as fh:
        fh.write(re.sub(r"\n{3,}", "\n\n", combined_text))

    overlay_gallery = [r["boxes_image"] for r in results if r["boxes_image"] is not None]
    return combined_text, (md_path if return_markdown else None), txt_path, overlay_gallery

# --- Gradio UI -----------------------------------------------------------
with gr.Blocks(title="Dots OCR — Indic-ready") as demo:
    gr.Markdown(
        "# dots.ocr — Multilingual OCR\n"
        "Upload PDFs or images. For Hindi/Marathi/Sanskrit, choose **Devanagari**.\n"
        "Outputs: Markdown + Plain text. (Best on GPU)"
    )
    with gr.Row():
        # type="filepath": run_ocr receives path strings, not file objects.
        files = gr.File(label="PDF(s) or image(s)", file_count="multiple", type="filepath")
        return_markdown = gr.Checkbox(True, label="Return Markdown")
    with gr.Row():
        # Single-choice radio kept for future tasks (layout-only, grounding, ...).
        task = gr.Radio(choices=["OCR"], value="OCR", label="Task")
        lang_hint = gr.Dropdown(
            label="Language/Script hint",
            choices=["Auto", "Devanagari (Hindi/Marathi/Sanskrit)", "Gujarati", "Bengali", "Tamil", "Telugu", "Kannada", "Malayalam", "Punjabi (Gurmukhi)", "Urdu (Arabic)"],
            value="Devanagari (Hindi/Marathi/Sanskrit)"
        )
        show_boxes = gr.Checkbox(False, label="Show layout boxes (slower)")
    btn = gr.Button("Run")

    # Outputs: the big text area plus downloadable .md/.txt artifacts and an
    # (initially hidden) gallery of layout overlays.
    out_text = gr.Textbox(label="Recognized text (Markdown or Plain Text)", lines=20)
    out_md = gr.File(label="Download Markdown", visible=True)
    out_txt = gr.File(label="Download Plain Text", visible=True)
    overlay = gr.Gallery(label="Layout overlays", visible=False)

    def _toggle(v):
        """Mirror the checkbox state onto the overlay gallery's visibility."""
        return gr.update(visible=v)

    show_boxes.change(_toggle, inputs=show_boxes, outputs=overlay)
    btn.click(run_ocr, inputs=[files, task, lang_hint, return_markdown, show_boxes],
              outputs=[out_text, out_md, out_txt, overlay])

# Launch only when executed as a script (Spaces import the module too).
if __name__ == "__main__":
    demo.launch()