Spaces:
Runtime error
Runtime error
| import os | |
| import io | |
| import tempfile | |
| import shutil | |
| from pathlib import Path | |
| import gradio as gr | |
| from PIL import Image | |
| import fitz # PyMuPDF | |
| # --- dots.ocr imports --- | |
| # We install dots.ocr in requirements.txt and download weights at startup. | |
| # The repo recommends saving weights under a path WITHOUT dots in the directory name. | |
| # Ref: README notes about "DotsOCR" dir. (See citations) | |
| from dots_ocr import pipeline as dots_pipeline # if module layout differs, adjust to: from dots_ocr.pipeline import DotsOCR | |
| WEIGHTS_DIR = Path(os.getenv("DOTS_WEIGHTS_DIR", "weights")) / "DotsOCR" | |
| WEIGHTS_DIR.mkdir(parents=True, exist_ok=True) | |
| # Lazily load the model once per Space session | |
| DOTS = None | |
| def load_model(): | |
| global DOTS | |
| if DOTS is None: | |
| # The library exposes a simple pipeline init; if upstream renames it, | |
| # swap to their documented API or helper. The HF Space by others uses similar. | |
| # You can pass prompt presets like "prompt_ocr" or "prompt_grounding_ocr" later. | |
| DOTS = dots_pipeline.load_model(model_dir=str(WEIGHTS_DIR)) | |
| return DOTS | |
| def pdf_to_images(pdf_bytes): | |
| """Return a list of PIL images from a PDF file bytes.""" | |
| imgs = [] | |
| with fitz.open(stream=pdf_bytes, filetype="pdf") as doc: | |
| for page in doc: | |
| pix = page.get_pixmap(alpha=False, dpi=180) | |
| img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) | |
| imgs.append(img) | |
| return imgs | |
| # --- Devanagari normalization helpers --- | |
| import re | |
| # Zero-width joiners | |
| ZWJ = "\u200D" | |
| ZWNJ = "\u200C" | |
| NUKTA = "\u093C" | |
| def normalize_devanagari(text: str) -> str: | |
| """Light-weight cleanup: normalize nukta sequences, strip stray ZWJ/ZWNJ, and fix hyphen linebreaks.""" | |
| if not text: | |
| return text | |
| # Remove stray joiners between letters (keep when used in known conjunct patterns if needed) | |
| text = text.replace(ZWJ, "").replace(ZWNJ, "") | |
| # Fix hyphenated line breaks: "मा-\n झी" -> "माझी" | |
| text = re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", text, flags=re.UNICODE) | |
| # Collapse multiple spaces/newlines | |
| text = re.sub(r"[ \t]+", " ", text, flags=re.UNICODE) | |
| text = re.sub(r"\n{3,}", "\n\n", text, flags=re.UNICODE) | |
| return text.strip() | |
| def run_ocr(files, task, lang_hint, return_markdown, show_boxes): | |
| model = load_model() | |
| # Prepare images | |
| images = [] | |
| for f in files: | |
| if f.name.lower().endswith(".pdf"): | |
| images.extend(pdf_to_images(f.read())) | |
| else: | |
| images.append(Image.open(f.name).convert("RGB")) | |
| # Choose prompt by task | |
| # From repo notes: switch between tasks via prompt like "prompt_ocr", "prompt_layout_only_en", "prompt_grounding_ocr". | |
| # We'll use OCR-focused prompt and optionally layout for Markdown. | |
| prompt = "prompt_ocr" if not show_boxes else "prompt_grounding_ocr" | |
| # Optional language/script hint to guide decoding. Devanagari covers Hindi/Marathi/Sanskrit. | |
| # If upstream exposes a language arg, pass it; otherwise keep in the prompt text. | |
| if lang_hint and lang_hint != "Auto": | |
| prompt = f"{prompt} ({lang_hint})" | |
| # Inference | |
| results = [] | |
| for img in images: | |
| out = model.infer(img, prompt=prompt, as_markdown=return_markdown, with_layout=return_markdown) | |
| # out can be dict with 'markdown', 'text', 'boxes' depending on upstream. | |
| # Normalize Devanagari when applicable | |
| text_md = out.get("markdown") if return_markdown else out.get("text") or out.get("markdown", "") | |
| if lang_hint.startswith("Devanagari") or "Hindi" in lang_hint or "Marathi" in lang_hint or "Sanskrit" in lang_hint: | |
| text_md = normalize_devanagari(text_md) | |
| results.append({ | |
| "text": text_md, | |
| "boxes_image": out.get("overlay_image") if show_boxes else None | |
| }) | |
| # Aggregate outputs | |
| combined_text = "\n\n".join([r["text"] for r in results if r["text"]]) | |
| # Save downloadable files | |
| md_path, txt_path = None, None | |
| if return_markdown: | |
| md_path = "output.md" | |
| with open(md_path, "w", encoding="utf-8") as f: | |
| f.write(combined_text) | |
| txt_path = "output.txt" | |
| with open(txt_path, "w", encoding="utf-8") as f: | |
| f.write(re.sub(r"\n{3,}", "\n\n", combined_text)) | |
| overlay_gallery = [r["boxes_image"] for r in results if r["boxes_image"] is not None] | |
| return combined_text, (md_path if return_markdown else None), txt_path, overlay_gallery | |
| with gr.Blocks(title="Dots OCR — Indic-ready") as demo: | |
| gr.Markdown( | |
| "# dots.ocr — Multilingual OCR\n" | |
| "Upload PDFs or images. For Hindi/Marathi/Sanskrit, choose **Devanagari**.\n" | |
| "Outputs: Markdown + Plain text. (Best on GPU)" | |
| ) | |
| with gr.Row(): | |
| files = gr.File(label="PDF(s) or image(s)", file_count="multiple", type="filepath") | |
| return_markdown = gr.Checkbox(True, label="Return Markdown") | |
| with gr.Row(): | |
| task = gr.Radio(choices=["OCR"], value="OCR", label="Task") | |
| lang_hint = gr.Dropdown( | |
| label="Language/Script hint", | |
| choices=["Auto", "Devanagari (Hindi/Marathi/Sanskrit)", "Gujarati", "Bengali", "Tamil", "Telugu", "Kannada", "Malayalam", "Punjabi (Gurmukhi)", "Urdu (Arabic)"], | |
| value="Devanagari (Hindi/Marathi/Sanskrit)" | |
| ) | |
| show_boxes = gr.Checkbox(False, label="Show layout boxes (slower)") | |
| btn = gr.Button("Run") | |
| out_text = gr.Textbox(label="Recognized text (Markdown or Plain Text)", lines=20) | |
| out_md = gr.File(label="Download Markdown", visible=True) | |
| out_txt = gr.File(label="Download Plain Text", visible=True) | |
| overlay = gr.Gallery(label="Layout overlays", visible=False) | |
| def _toggle(v): | |
| return gr.update(visible=v) | |
| show_boxes.change(_toggle, inputs=show_boxes, outputs=overlay) | |
| btn.click(run_ocr, inputs=[files, task, lang_hint, return_markdown, show_boxes], | |
| outputs=[out_text, out_md, out_txt, overlay]) | |
| if __name__ == "__main__": | |
| demo.launch() | |