import os
import io
import tempfile
import shutil
from pathlib import Path
import gradio as gr
from PIL import Image
import fitz # PyMuPDF
# --- dots.ocr imports ---
# We install dots.ocr in requirements.txt and download weights at startup.
# The repo recommends saving weights under a path WITHOUT dots in the directory name.
# Ref: README notes about "DotsOCR" dir. (See citations)
from dots_ocr import pipeline as dots_pipeline # if module layout differs, adjust to: from dots_ocr.pipeline import DotsOCR
WEIGHTS_DIR = Path(os.getenv("DOTS_WEIGHTS_DIR", "weights")) / "DotsOCR"
WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)

# Process-wide model handle, populated lazily on first use.
DOTS = None


def load_model():
    """Return the shared dots.ocr pipeline, loading it on the first call.

    The module-global ``DOTS`` acts as a memo so the expensive weight load
    happens at most once per Space session.
    """
    global DOTS
    if DOTS is not None:
        return DOTS
    # Simple pipeline init; if upstream renames this helper, swap to their
    # documented API. Prompt presets ("prompt_ocr", "prompt_grounding_ocr")
    # are passed at inference time, not here.
    DOTS = dots_pipeline.load_model(model_dir=str(WEIGHTS_DIR))
    return DOTS
def pdf_to_images(pdf_bytes):
    """Rasterize every page of a PDF (given as raw bytes) to a PIL RGB image."""
    pages = []
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            # 180 dpi is a compromise between OCR accuracy and memory use.
            pixmap = page.get_pixmap(alpha=False, dpi=180)
            pages.append(
                Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)
            )
    return pages
# --- Devanagari normalization helpers ---
import re
import unicodedata

# Zero-width (non-)joiners that OCR sometimes emits between letters.
ZWJ = "\u200D"
ZWNJ = "\u200C"
NUKTA = "\u093C"  # combining nukta; NFC composes it into precomposed letters


def normalize_devanagari(text: str) -> str:
    """Light-weight cleanup for OCR'd Devanagari text.

    Steps:
    - strip stray ZWJ/ZWNJ joiners,
    - NFC-compose nukta sequences (e.g. U+0928 U+093C -> U+0929),
    - join hyphenated line breaks,
    - collapse runs of spaces and blank lines.

    Returns the input unchanged if it is empty/None.
    """
    if not text:
        return text
    # Remove stray joiners between letters (keep when used in known
    # conjunct patterns if needed).
    text = text.replace(ZWJ, "").replace(ZWNJ, "")
    # Compose base letter + nukta into the single precomposed codepoint so
    # downstream search/compare sees a canonical form. (The original code
    # defined NUKTA and promised this in the docstring but never did it.)
    text = unicodedata.normalize("NFC", text)
    # Fix hyphenated line breaks: "मा-\n झी" -> "माझी"
    text = re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", text, flags=re.UNICODE)
    # Collapse multiple spaces/newlines
    text = re.sub(r"[ \t]+", " ", text, flags=re.UNICODE)
    text = re.sub(r"\n{3,}", "\n\n", text, flags=re.UNICODE)
    return text.strip()
def run_ocr(files, task, lang_hint, return_markdown, show_boxes):
    """OCR the uploaded PDFs/images and return text plus downloadable files.

    Parameters
    ----------
    files : list
        Uploads from ``gr.File(type="filepath")`` — path strings, or
        tempfile-like objects with a ``.name`` on older Gradio versions.
    task : str
        Task selector from the UI (currently only "OCR"; unused here).
    lang_hint : str | None
        Language/script hint; "Auto" (or None) disables prompt steering.
    return_markdown : bool
        When True, request layout-aware Markdown and write ``output.md``.
    show_boxes : bool
        When True, use the grounding prompt and collect overlay images.

    Returns
    -------
    tuple
        (combined_text, md_path or None, txt_path, overlay_gallery)
    """
    model = load_model()

    # Collect input images; PDFs are rasterized page by page.
    images = []
    for f in files:
        # gr.File(type="filepath") yields plain path strings; fall back to
        # .name for file-like objects. The original called f.read(), which
        # fails for path strings.
        path = getattr(f, "name", f)
        if str(path).lower().endswith(".pdf"):
            with open(path, "rb") as fh:
                images.extend(pdf_to_images(fh.read()))
        else:
            images.append(Image.open(path).convert("RGB"))

    # Prompt presets per repo notes: "prompt_ocr" for plain OCR,
    # "prompt_grounding_ocr" when layout boxes are requested.
    prompt = "prompt_grounding_ocr" if show_boxes else "prompt_ocr"

    # Upstream exposes no language argument, so steer decoding by embedding
    # the hint in the prompt text. (The original appended a stray
    # "(python)" here — an obvious leftover.)
    if lang_hint and lang_hint != "Auto":
        prompt = f"{prompt} (Language: {lang_hint})"

    # Guard against a None hint — the original crashed on .startswith().
    wants_devanagari = bool(lang_hint) and (
        lang_hint.startswith("Devanagari")
        or any(tag in lang_hint for tag in ("Hindi", "Marathi", "Sanskrit"))
    )

    results = []
    for img in images:
        out = model.infer(
            img,
            prompt=prompt,
            as_markdown=return_markdown,
            with_layout=return_markdown,
        )
        # out can be a dict with 'markdown', 'text', 'boxes' depending on
        # upstream; fall back across keys and never carry None forward.
        if return_markdown:
            text_md = out.get("markdown") or out.get("text") or ""
        else:
            text_md = out.get("text") or out.get("markdown") or ""
        if wants_devanagari:
            text_md = normalize_devanagari(text_md)
        results.append({
            "text": text_md,
            "boxes_image": out.get("overlay_image") if show_boxes else None,
        })

    # Aggregate page/file outputs.
    combined_text = "\n\n".join(r["text"] for r in results if r["text"])

    # Save downloadable files.
    md_path = None
    if return_markdown:
        md_path = "output.md"
        with open(md_path, "w", encoding="utf-8") as fh:
            fh.write(combined_text)
    txt_path = "output.txt"
    with open(txt_path, "w", encoding="utf-8") as fh:
        fh.write(re.sub(r"\n{3,}", "\n\n", combined_text))

    overlay_gallery = [r["boxes_image"] for r in results if r["boxes_image"] is not None]
    return combined_text, (md_path if return_markdown else None), txt_path, overlay_gallery
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks(title="Dots OCR — Indic-ready") as demo:
    gr.Markdown(
        "# dots.ocr — Multilingual OCR\n"
        "Upload PDFs or images. For Hindi/Marathi/Sanskrit, choose **Devanagari**.\n"
        "Outputs: Markdown + Plain text. (Best on GPU)"
    )

    # Inputs
    with gr.Row():
        files = gr.File(label="PDF(s) or image(s)", file_count="multiple", type="filepath")
        return_markdown = gr.Checkbox(True, label="Return Markdown")
    with gr.Row():
        task = gr.Radio(choices=["OCR"], value="OCR", label="Task")
        lang_hint = gr.Dropdown(
            label="Language/Script hint",
            choices=[
                "Auto",
                "Devanagari (Hindi/Marathi/Sanskrit)",
                "Gujarati",
                "Bengali",
                "Tamil",
                "Telugu",
                "Kannada",
                "Malayalam",
                "Punjabi (Gurmukhi)",
                "Urdu (Arabic)",
            ],
            value="Devanagari (Hindi/Marathi/Sanskrit)",
        )
        show_boxes = gr.Checkbox(False, label="Show layout boxes (slower)")

    btn = gr.Button("Run")

    # Outputs
    out_text = gr.Textbox(label="Recognized text (Markdown or Plain Text)", lines=20)
    out_md = gr.File(label="Download Markdown", visible=True)
    out_txt = gr.File(label="Download Plain Text", visible=True)
    overlay = gr.Gallery(label="Layout overlays", visible=False)

    def _toggle(v):
        # Show the overlay gallery only when box rendering is enabled.
        return gr.update(visible=v)

    show_boxes.change(_toggle, inputs=show_boxes, outputs=overlay)
    btn.click(
        run_ocr,
        inputs=[files, task, lang_hint, return_markdown, show_boxes],
        outputs=[out_text, out_md, out_txt, overlay],
    )

if __name__ == "__main__":
    demo.launch()