Spaces:

mahdisd
/

ocr

Runtime error

File size: 10,242 Bytes

50d4599

import gradio as gr
from pdf2image import convert_from_path
from PIL import Image
import os
import tempfile

# ── Lazy model cache (loaded once per session) ───────────────────────────────
# Maps an engine key ("qari", "easyocr", "paddle") to the object built by the
# matching load_* function, so each engine is constructed at most once.
_cache: dict = {}

def load_qari():
    """Return the QARI-OCR processor/model pair, building it on first use.

    ``transformers`` and ``torch`` are imported only when the model actually
    has to be loaded. The result is stored in ``_cache`` under ``"qari"`` and
    reused by later calls.

    Returns:
        dict: ``{"processor": ..., "model": ...}``.
    """
    bundle = _cache.get("qari")
    if bundle is not None:
        return bundle

    from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
    import torch

    model_id = "NAMAA-Space/Qari-OCR-v0.3-VL-2B-Instruct"
    processor = AutoProcessor.from_pretrained(model_id)

    # Half precision only when CUDA is available; float32 otherwise.
    if torch.cuda.is_available():
        dtype = torch.float16
    else:
        dtype = torch.float32

    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=dtype,
        device_map="auto",
    )

    bundle = {"processor": processor, "model": model}
    _cache["qari"] = bundle
    return bundle

def load_easyocr():
    """Return the shared EasyOCR reader for Arabic, creating it on first use.

    The reader is built with ``gpu=False`` and stored in ``_cache`` under
    ``"easyocr"``.
    """
    reader = _cache.get("easyocr")
    if reader is None:
        import easyocr

        reader = easyocr.Reader(["ar"], gpu=False)
        _cache["easyocr"] = reader
    return reader

def load_paddle():
    """Return the shared PaddleOCR engine for Arabic, creating it on first use.

    The engine is configured with angle classification enabled, GPU disabled
    and logging silenced, and is stored in ``_cache`` under ``"paddle"``.
    """
    engine = _cache.get("paddle")
    if engine is None:
        from paddleocr import PaddleOCR

        options = {
            "use_angle_cls": True,
            "lang": "ar",
            "use_gpu": False,
            "show_log": False,
        }
        engine = PaddleOCR(**options)
        _cache["paddle"] = engine
    return engine

# ── QARI-OCR single-image inference ──────────────────────────────────────────
def qari_ocr_image(image, m):
    """Run QARI-OCR on a single page image and return the decoded text.

    Args:
        image: Page image; only its ``save(path)`` method is used here
            (presumably a ``PIL.Image.Image`` — confirm against callers).
        m: Mapping with a ``"processor"`` and a ``"model"`` entry, as
            returned by ``load_qari``.

    Returns:
        str: Decoded model output for the page, with the prompt tokens
        stripped and special tokens skipped.
    """
    import torch
    from qwen_vl_utils import process_vision_info

    # Each call gets its own temp file. A fixed path would let overlapping
    # requests overwrite each other's page and would never be cleaned up.
    fd, tmp = tempfile.mkstemp(prefix="qari_page_", suffix=".png")
    os.close(fd)
    try:
        image.save(tmp)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{tmp}"},
                    {"type": "text",  "text": "Extract all Arabic text from this image. Output only the text, preserving line breaks."},
                ],
            }
        ]

        text_prompt = m["processor"].apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = m["processor"](
            text=[text_prompt],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(m["model"].device)
    finally:
        # The model inputs are built by now (or an error is propagating);
        # either way the file on disk is no longer needed.
        os.remove(tmp)

    with torch.no_grad():
        generated_ids = m["model"].generate(**inputs, max_new_tokens=2048)

    # generate() returns prompt + completion; drop the prompt prefix so only
    # the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    return m["processor"].batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]


# ── Main OCR runner ───────────────────────────────────────────────────────────
def run_ocr(pdf_file, model_choice, dpi, progress=gr.Progress(track_tqdm=True)):
    """Rasterise an uploaded PDF and OCR every page with the chosen engine.

    Args:
        pdf_file: Value handed to ``convert_from_path``; ``None`` when
            nothing has been uploaded.
        model_choice: Model label. The engine is selected by checking whether
            the label contains ``"QARI"``, ``"EasyOCR"`` or ``"PaddleOCR"``.
        dpi: Rasterisation resolution; coerced with ``int()``.
        progress: Gradio progress tracker used to report per-page status.

    Returns:
        tuple: ``(text, path)`` where ``text`` is the extracted text of all
        pages joined by blank lines and ``path`` is a UTF-8 ``.txt`` file
        holding the same text. When no file was uploaded, or when any
        exception is raised, ``path`` is ``None`` and ``text`` is a
        user-facing message (including the traceback for exceptions).
    """
    if pdf_file is None:
        return "⚠️ Please upload a PDF file first.", None

    try:
        progress(0.05, desc="Converting PDF pages to images…")
        images = convert_from_path(pdf_file, dpi=int(dpi))
        n = len(images)
        all_text = []

        # Per-request scratch directory for page images. Fixed /tmp names
        # would collide between overlapping requests and were never removed.
        with tempfile.TemporaryDirectory(prefix="arabic_ocr_pages_") as work_dir:

            # ── QARI-OCR ──────────────────────────────────────────────────
            if "QARI" in model_choice:
                progress(0.10, desc="Loading QARI-OCR model (first run: ~2 min, downloads ~4 GB)…")
                m = load_qari()
                for i, image in enumerate(images):
                    progress(0.10 + 0.85 * (i / n), desc=f"QARI-OCR — page {i+1}/{n}")
                    page_text = qari_ocr_image(image, m)
                    all_text.append(f"─── Page {i+1} ───\n{page_text}")

            # ── EasyOCR ───────────────────────────────────────────────────
            elif "EasyOCR" in model_choice:
                progress(0.10, desc="Loading EasyOCR model (first run: ~30 s)…")
                reader = load_easyocr()
                for i, image in enumerate(images):
                    progress(0.10 + 0.85 * (i / n), desc=f"EasyOCR — page {i+1}/{n}")
                    tmp = os.path.join(work_dir, f"page_{i}.png")
                    image.save(tmp)
                    lines = reader.readtext(tmp, detail=0, paragraph=True)
                    all_text.append(f"─── Page {i+1} ───\n" + "\n".join(lines))

            # ── PaddleOCR ─────────────────────────────────────────────────
            elif "PaddleOCR" in model_choice:
                progress(0.10, desc="Loading PaddleOCR model (first run: ~30 s)…")
                ocr = load_paddle()
                for i, image in enumerate(images):
                    progress(0.10 + 0.85 * (i / n), desc=f"PaddleOCR — page {i+1}/{n}")
                    tmp = os.path.join(work_dir, f"page_{i}.png")
                    image.save(tmp)
                    result = ocr.ocr(tmp, cls=True)
                    lines = []
                    # Guard every level: the result, its first entry and the
                    # per-line payload may each be empty or None.
                    if result and result[0]:
                        for line in result[0]:
                            if line and len(line) >= 2 and line[1]:
                                lines.append(line[1][0])
                    all_text.append(f"─── Page {i+1} ───\n" + "\n".join(lines))

            else:
                # Report an unrecognised label through the error path below
                # instead of silently returning empty output.
                raise ValueError(f"Unknown OCR model: {model_choice!r}")

        progress(0.98, desc="Saving output…")
        full_text = "\n\n".join(all_text)

        # A fresh directory per request keeps the download name stable while
        # preventing one request from overwriting another's result. It must
        # outlive this function so the returned path stays readable.
        out_dir = tempfile.mkdtemp(prefix="arabic_ocr_")
        out_path = os.path.join(out_dir, "arabic_ocr_result.txt")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(full_text)

        progress(1.0, desc="Done!")
        return full_text, out_path

    except Exception as e:
        # UI boundary: surface the failure in the textbox rather than raising.
        import traceback
        return f"❌ Error: {e}\n\n{traceback.format_exc()}", None


# ── Model descriptions ────────────────────────────────────────────────────────
# Dropdown label → short description shown under the model selector.
MODEL_INFO = {
    "🥇 QARI-OCR  (Best — built for Arabic)": (
        "Fine-tuned vision-language model built specifically for Arabic "
        "by the NAMAA Arabic NLP community. "
        "Handles diacritics, mixed fonts, and complex layouts. "
        "State-of-the-art accuracy. "
        "First run downloads ~4 GB; subsequent runs are fast."
    ),
    "🥈 EasyOCR  (Fast & accurate)": (
        "Deep-learning OCR with strong Arabic support. "
        "Great speed/accuracy trade-off. ~30 s to load first time."
    ),
    "🥉 PaddleOCR  (Also excellent)": (
        "PP-OCR v4 — very fast and accurate for clean printed Arabic text."
    ),
}

# ── Custom CSS ────────────────────────────────────────────────────────────────
# Stylesheet passed to gr.Blocks(css=...). The selectors match the elem_id /
# elem_classes values assigned to components in the UI section: #title,
# #subtitle, #run-btn, .arabic-out (right-to-left output box) and .model-note.
# The last rule hides the page footer.
CSS = """
#title    { text-align: center; }
#subtitle { text-align: center; color: #666; margin-top: -10px; }
#run-btn  { font-size: 1.1em !important; }
.arabic-out textarea {
    direction: rtl !important;
    text-align: right !important;
    font-size: 15px !important;
    line-height: 1.9 !important;
    font-family: 'Amiri', 'Scheherazade New', 'Arabic Typesetting', serif !important;
}
.model-note {
    font-size: 0.85em;
    color: #666;
    margin-top: -6px;
    padding: 4px 6px;
    background: #f8f8f8;
    border-radius: 6px;
}
footer { display: none !important; }
"""

# ── Gradio UI ─────────────────────────────────────────────────────────────────
# Two-column layout: upload/model/DPI controls on the left, extracted text and
# the downloadable .txt on the right. Components are created in display order.
with gr.Blocks(css=CSS, title="Arabic PDF OCR", theme=gr.themes.Soft()) as demo:

    # Page heading and tagline (styled via #title / #subtitle in CSS).
    gr.HTML("<h1 id='title'>🕌 Arabic PDF OCR</h1>")
    gr.HTML("<p id='subtitle'>Upload an Arabic PDF → pick a model → extract text. Free & open-source.</p>")

    with gr.Row(equal_height=False):

        # Left — controls
        with gr.Column(scale=1, min_width=300):
            # Upload widget restricted to .pdf files.
            pdf_input = gr.File(
                label="📎 Upload Arabic PDF",
                file_types=[".pdf"],
                height=160,
            )
            # Choices are the MODEL_INFO keys; run_ocr selects the engine by
            # substring match on the chosen label.
            model_choice = gr.Dropdown(
                choices=list(MODEL_INFO.keys()),
                value="🥇 QARI-OCR  (Best — built for Arabic)",
                label="🤖 OCR Model",
                interactive=True,
            )
            # Description of the selected model; starts with the default
            # choice and is refreshed by update_note on dropdown changes.
            model_note = gr.Markdown(
                MODEL_INFO["🥇 QARI-OCR  (Best — built for Arabic)"],
                elem_classes=["model-note"],
            )
            # Rasterisation DPI forwarded to run_ocr (150–400 in steps of 50).
            dpi = gr.Slider(
                minimum=150, maximum=400, value=300, step=50,
                label="📐 Scan quality (DPI)",
                info="300 is ideal. Use 400 for blurry or small text.",
            )
            run_btn = gr.Button(
                "🔍  Extract Text", variant="primary",
                size="lg", elem_id="run-btn",
            )

        # Right — output
        with gr.Column(scale=2):
            # The "arabic-out" class applies the right-to-left rules from CSS.
            text_out = gr.Textbox(
                label="📝 Extracted Text",
                lines=22,
                placeholder="Your Arabic text will appear here after extraction…",
                show_copy_button=True,
                elem_classes=["arabic-out"],
            )
            # Receives the .txt path returned as run_ocr's second value.
            file_out = gr.File(label="💾 Download as .txt", interactive=False)

    def update_note(choice: str) -> str:
        """Return the MODEL_INFO description for *choice*, or "" if unknown."""
        return MODEL_INFO.get(choice, "")

    # Keep the description in sync with the dropdown selection.
    model_choice.change(update_note, inputs=model_choice, outputs=model_note)

    # Run OCR on click; the (text, path) result fills the textbox and the
    # download component respectively.
    run_btn.click(
        fn=run_ocr,
        inputs=[pdf_input, model_choice, dpi],
        outputs=[text_out, file_out],
    )

    # Footer crediting the three OCR engines.
    gr.HTML("""
        <div style="text-align:center;margin-top:20px;color:#aaa;font-size:0.82em">
            Powered by open-source engines &nbsp;·&nbsp;
            <a href="https://huggingface.co/NAMAA-Space/Qari-OCR-v0.3-VL-2B-Instruct" target="_blank">QARI-OCR (NAMAA)</a> &nbsp;·&nbsp;
            <a href="https://github.com/JaidedAI/EasyOCR" target="_blank">EasyOCR</a> &nbsp;·&nbsp;
            <a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">PaddleOCR</a>
        </div>
    """)

# Starts the app as a module-level side effect when this file is executed.
demo.launch()