File size: 6,023 Bytes
498832d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import os
import io
import tempfile
import shutil
from pathlib import Path
import gradio as gr
from PIL import Image
import fitz  # PyMuPDF

# --- dots.ocr imports ---
# We install dots.ocr in requirements.txt and download weights at startup.
# The repo recommends saving weights under a path WITHOUT dots in the directory name.
# Ref: README notes about "DotsOCR" dir. (See citations)
from dots_ocr import pipeline as dots_pipeline  # if module layout differs, adjust to: from dots_ocr.pipeline import DotsOCR

# Directory holding the model weights. The upstream README recommends a
# weights path WITHOUT a dot in the directory name (hence "DotsOCR", not
# "dots.ocr"); DOTS_WEIGHTS_DIR lets a deployment override the parent dir.
WEIGHTS_DIR = Path(os.getenv("DOTS_WEIGHTS_DIR", "weights")) / "DotsOCR"
WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)

# Module-level cache: populated on the first load_model() call so the
# (expensive) model load happens at most once per Space session.
DOTS = None

def load_model():
    """Return the shared dots.ocr pipeline, initialising it on first use.

    The loaded model is cached in the module-level ``DOTS`` variable so the
    expensive weight loading runs at most once per process.
    """
    global DOTS
    if DOTS is not None:
        return DOTS
    # NOTE(review): upstream may rename this loader between releases; if the
    # module layout differs, switch to the documented API/helper. Prompt
    # presets such as "prompt_ocr" / "prompt_grounding_ocr" are passed later,
    # at inference time.
    DOTS = dots_pipeline.load_model(model_dir=str(WEIGHTS_DIR))
    return DOTS

def pdf_to_images(pdf_bytes):
    """Rasterise every page of a PDF (given as raw bytes) into PIL images.

    Pages are rendered at 180 dpi with no alpha channel, which keeps the
    output sharp enough for OCR while bounding memory use.
    """
    pages = []
    with fitz.open(stream=pdf_bytes, filetype="pdf") as document:
        for page in document:
            pixmap = page.get_pixmap(alpha=False, dpi=180)
            pages.append(
                Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
            )
    return pages

# --- Devanagari normalization helpers ---
import re  # NOTE(review): mid-file import; conventionally belongs at the top of the file
# Invisible joiner code points that OCR output often sprinkles between letters.
ZWJ = "\u200D"   # U+200D ZERO WIDTH JOINER
ZWNJ = "\u200C"  # U+200C ZERO WIDTH NON-JOINER
NUKTA = "\u093C"  # U+093C combining nukta sign; currently unused below

def normalize_devanagari(text: str) -> str:
    """Light-weight cleanup: normalize nukta sequences, strip stray ZWJ/ZWNJ, and fix hyphen linebreaks."""
    if not text:
        return text
    # Remove stray joiners between letters (keep when used in known conjunct patterns if needed)
    text = text.replace(ZWJ, "").replace(ZWNJ, "")
    # Fix hyphenated line breaks: "मा-\n झी" -> "माझी"
    text = re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", text, flags=re.UNICODE)
    # Collapse multiple spaces/newlines
    text = re.sub(r"[ \t]+", " ", text, flags=re.UNICODE)
    text = re.sub(r"\n{3,}", "\n\n", text, flags=re.UNICODE)
    return text.strip()

def run_ocr(files, task, lang_hint, return_markdown, show_boxes):
    """Run dots.ocr over the uploaded files and aggregate the results.

    Parameters
    ----------
    files : list | None
        Uploads from the Gradio ``File`` component. With ``type="filepath"``
        these are plain path strings; file-like objects (with ``.name``) are
        tolerated for older Gradio versions. ``None`` is treated as empty.
    task : str
        Currently always "OCR"; kept for interface stability with the UI.
    lang_hint : str | None
        Language/script hint from the dropdown; "Auto" (or None) disables it.
    return_markdown : bool
        If True, request layout-aware Markdown and also write ``output.md``.
    show_boxes : bool
        If True, use the grounding prompt and collect overlay images.

    Returns
    -------
    tuple
        (combined text, markdown path or None, plain-text path, overlay list).
    """
    model = load_model()

    # --- Gather input pages as PIL images --------------------------------
    images = []
    for f in files or []:
        # BUG FIX: the original called f.read()/f.name, but with
        # gr.File(type="filepath") each entry is a path string, so that
        # crashed at runtime. Accept both forms and read PDFs from disk.
        path = f if isinstance(f, (str, Path)) else f.name
        if str(path).lower().endswith(".pdf"):
            with open(path, "rb") as fh:
                images.extend(pdf_to_images(fh.read()))
        else:
            images.append(Image.open(path).convert("RGB"))

    # --- Choose the prompt preset ----------------------------------------
    # Upstream switches tasks via prompt names such as "prompt_ocr",
    # "prompt_layout_only_en" and "prompt_grounding_ocr".
    prompt = "prompt_grounding_ocr" if show_boxes else "prompt_ocr"

    # Fold the language/script hint into the prompt text (upstream exposes no
    # dedicated language argument). BUG FIX: the original appended the literal
    # "(python)" instead of the selected hint.
    hint = lang_hint or "Auto"
    if hint != "Auto":
        prompt = f"{prompt} ({hint})"

    # Devanagari covers Hindi/Marathi/Sanskrit; guard against a None hint,
    # which previously crashed on lang_hint.startswith(...).
    is_devanagari = (
        hint.startswith("Devanagari")
        or "Hindi" in hint
        or "Marathi" in hint
        or "Sanskrit" in hint
    )

    # --- Inference, one image at a time ----------------------------------
    results = []
    for img in images:
        out = model.infer(
            img, prompt=prompt, as_markdown=return_markdown, with_layout=return_markdown
        )
        # Upstream may return 'markdown', 'text' and/or 'boxes'. BUG FIX: the
        # original expression's precedence made text_md None when markdown was
        # requested but missing; prefer the requested format, then fall back.
        if return_markdown:
            text_md = out.get("markdown") or out.get("text", "")
        else:
            text_md = out.get("text") or out.get("markdown", "")
        if is_devanagari:
            text_md = normalize_devanagari(text_md)

        results.append({
            "text": text_md,
            "boxes_image": out.get("overlay_image") if show_boxes else None,
        })

    # --- Aggregate and write downloadable artifacts ----------------------
    combined_text = "\n\n".join(r["text"] for r in results if r["text"])

    md_path = None
    if return_markdown:
        md_path = "output.md"
        with open(md_path, "w", encoding="utf-8") as fh:
            fh.write(combined_text)

    txt_path = "output.txt"
    with open(txt_path, "w", encoding="utf-8") as fh:
        fh.write(re.sub(r"\n{3,}", "\n\n", combined_text))

    overlay_gallery = [r["boxes_image"] for r in results if r["boxes_image"] is not None]
    return combined_text, (md_path if return_markdown else None), txt_path, overlay_gallery

# --- Gradio UI -----------------------------------------------------------
with gr.Blocks(title="Dots OCR — Indic-ready") as demo:
    gr.Markdown(
        "# dots.ocr — Multilingual OCR\n"
        "Upload PDFs or images. For Hindi/Marathi/Sanskrit, choose **Devanagari**.\n"
        "Outputs: Markdown + Plain text. (Best on GPU)"
    )
    with gr.Row():
        # type="filepath": run_ocr receives path strings, not file objects.
        files = gr.File(label="PDF(s) or image(s)", file_count="multiple", type="filepath")
        return_markdown = gr.Checkbox(True, label="Return Markdown")
    with gr.Row():
        # Single-choice radio kept for future tasks (layout-only, grounding, ...).
        task = gr.Radio(choices=["OCR"], value="OCR", label="Task")
        lang_hint = gr.Dropdown(
            label="Language/Script hint",
            choices=["Auto", "Devanagari (Hindi/Marathi/Sanskrit)", "Gujarati", "Bengali", "Tamil", "Telugu", "Kannada", "Malayalam", "Punjabi (Gurmukhi)", "Urdu (Arabic)"],
            value="Devanagari (Hindi/Marathi/Sanskrit)"
        )
        show_boxes = gr.Checkbox(False, label="Show layout boxes (slower)")
    btn = gr.Button("Run")

    # Outputs: the big text area plus downloadable .md/.txt artifacts and an
    # (initially hidden) gallery of layout overlays.
    out_text = gr.Textbox(label="Recognized text (Markdown or Plain Text)", lines=20)
    out_md = gr.File(label="Download Markdown", visible=True)
    out_txt = gr.File(label="Download Plain Text", visible=True)
    overlay = gr.Gallery(label="Layout overlays", visible=False)

    def _toggle(v):
        """Mirror the checkbox state onto the overlay gallery's visibility."""
        return gr.update(visible=v)

    show_boxes.change(_toggle, inputs=show_boxes, outputs=overlay)
    btn.click(run_ocr, inputs=[files, task, lang_hint, return_markdown, show_boxes],
              outputs=[out_text, out_md, out_txt, overlay])

# Launch only when executed as a script (Spaces import the module too).
if __name__ == "__main__":
    demo.launch()