"""Gradio app that extracts Arabic text from a PDF with a selectable OCR engine.

The PDF is rasterised page by page with ``pdf2image`` and each page is passed
to one of three engines (QARI-OCR, EasyOCR or PaddleOCR). The combined text is
shown in the UI and offered as a ``.txt`` download.
"""

import os
import tempfile
import traceback

import gradio as gr
from pdf2image import convert_from_path
from PIL import Image

# ── Lazy model cache (loaded once per session) ───────────────────────────────
_cache = {}


def load_qari():
    """Return the cached QARI-OCR ``{"processor", "model"}`` dict, loading it on first use."""
    if "qari" not in _cache:
        # Heavy dependencies are imported lazily so the UI starts without them.
        from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
        import torch

        model_id = "NAMAA-Space/Qari-OCR-v0.3-VL-2B-Instruct"
        _cache["qari"] = {
            "processor": AutoProcessor.from_pretrained(model_id),
            "model": Qwen2VLForConditionalGeneration.from_pretrained(
                model_id,
                # Half precision only when a CUDA device is available.
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map="auto",
            ),
        }
    return _cache["qari"]


def load_easyocr():
    """Return the cached CPU EasyOCR reader for Arabic, creating it on first use."""
    if "easyocr" not in _cache:
        import easyocr

        _cache["easyocr"] = easyocr.Reader(["ar"], gpu=False)
    return _cache["easyocr"]


def load_paddle():
    """Return the cached CPU PaddleOCR instance for Arabic, creating it on first use."""
    if "paddle" not in _cache:
        from paddleocr import PaddleOCR

        _cache["paddle"] = PaddleOCR(
            use_angle_cls=True, lang="ar", use_gpu=False, show_log=False
        )
    return _cache["paddle"]


# ── QARI-OCR single-image inference ──────────────────────────────────────────
def qari_ocr_image(image: Image.Image, m: dict) -> str:
    """Run QARI-OCR on one page image and return the decoded text.

    Args:
        image: Page image; it is written to a private temporary PNG.
        m: Mapping with ``"processor"`` and ``"model"`` entries, as returned
            by ``load_qari()``.

    Returns:
        The first decoded sequence, with the prompt tokens removed.
    """
    import torch
    from qwen_vl_utils import process_vision_info

    processor = m["processor"]
    model = m["model"]

    # A per-call temporary directory replaces the old fixed /tmp/qari_page.png,
    # which concurrent requests would overwrite and which was never deleted.
    with tempfile.TemporaryDirectory(prefix="qari_ocr_") as work_dir:
        page_path = os.path.join(work_dir, "qari_page.png")
        image.save(page_path)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{page_path}"},
                    {
                        "type": "text",
                        "text": (
                            "Extract all Arabic text from this image. "
                            "Output only the text, preserving line breaks."
                        ),
                    },
                ],
            }
        ]
        text_prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # The model inputs are built while the PNG still exists on disk.
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text_prompt],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=2048)

    # Drop the prompt tokens so only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    return processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]


def _save_page(image, work_dir, index):
    """Write *image* as ``page_<index>.png`` inside *work_dir* and return its path."""
    page_path = os.path.join(work_dir, f"page_{index}.png")
    image.save(page_path)
    return page_path


def _paddle_lines(result):
    """Collect the recognised text strings from a PaddleOCR result for one image."""
    lines = []
    if result and result[0]:
        for line in result[0]:
            # Each usable entry carries its text as the first item of entry[1].
            if line and len(line) >= 2 and line[1]:
                lines.append(line[1][0])
    return lines


# ── Main OCR runner ───────────────────────────────────────────────────────────
def run_ocr(pdf_file, model_choice, dpi, progress=gr.Progress(track_tqdm=True)):
    """Convert the uploaded PDF to images, OCR every page and save the text.

    Args:
        pdf_file: Path of the uploaded PDF (or an object exposing the path as
            ``.name``); ``None`` when nothing was uploaded.
        model_choice: Dropdown label; the engine is picked by substring
            (``"QARI"``, ``"EasyOCR"`` or ``"PaddleOCR"``).
        dpi: Rasterisation resolution, converted with ``int()``.
        progress: Gradio progress reporter.

    Returns:
        ``(text, path)`` where *path* is the saved ``.txt`` file, or
        ``(message, None)`` when no file was given or an error occurred.
    """
    if pdf_file is None:
        return "⚠️ Please upload a PDF file first.", None

    try:
        # NOTE(review): some Gradio versions appear to pass a file object
        # rather than a path string; accept both — confirm against the
        # deployed Gradio version.
        pdf_path = getattr(pdf_file, "name", pdf_file)

        progress(0.05, desc="Converting PDF pages to images…")
        images = convert_from_path(pdf_path, dpi=int(dpi))
        n = len(images)
        all_text = []

        # Page images go to a private directory that is removed afterwards,
        # so simultaneous requests cannot clobber each other's files.
        with tempfile.TemporaryDirectory(prefix="arabic_ocr_pages_") as work_dir:
            # ── QARI-OCR ──────────────────────────────────────────────────
            if "QARI" in model_choice:
                progress(0.10, desc="Loading QARI-OCR model (first run: ~2 min, downloads ~4 GB)…")
                m = load_qari()
                for i, image in enumerate(images):
                    progress(0.10 + 0.85 * (i / n), desc=f"QARI-OCR — page {i+1}/{n}")
                    page_text = qari_ocr_image(image, m)
                    all_text.append(f"─── Page {i+1} ───\n{page_text}")

            # ── EasyOCR ───────────────────────────────────────────────────
            elif "EasyOCR" in model_choice:
                progress(0.10, desc="Loading EasyOCR model (first run: ~30 s)…")
                reader = load_easyocr()
                for i, image in enumerate(images):
                    progress(0.10 + 0.85 * (i / n), desc=f"EasyOCR — page {i+1}/{n}")
                    page_path = _save_page(image, work_dir, i)
                    lines = reader.readtext(page_path, detail=0, paragraph=True)
                    all_text.append(f"─── Page {i+1} ───\n" + "\n".join(lines))

            # ── PaddleOCR ─────────────────────────────────────────────────
            elif "PaddleOCR" in model_choice:
                progress(0.10, desc="Loading PaddleOCR model (first run: ~30 s)…")
                ocr = load_paddle()
                for i, image in enumerate(images):
                    progress(0.10 + 0.85 * (i / n), desc=f"PaddleOCR — page {i+1}/{n}")
                    page_path = _save_page(image, work_dir, i)
                    lines = _paddle_lines(ocr.ocr(page_path, cls=True))
                    all_text.append(f"─── Page {i+1} ───\n" + "\n".join(lines))

        progress(0.98, desc="Saving output…")
        full_text = "\n\n".join(all_text)

        # The result must outlive this call so the UI can offer it for
        # download; a unique directory keeps the friendly file name without
        # sharing one path between requests.
        out_dir = tempfile.mkdtemp(prefix="arabic_ocr_")
        out_path = os.path.join(out_dir, "arabic_ocr_result.txt")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(full_text)

        progress(1.0, desc="Done!")
        return full_text, out_path

    except Exception as e:  # UI boundary: show the failure in the output box.
        return f"❌ Error: {e}\n\n{traceback.format_exc()}", None


# ── Model descriptions ────────────────────────────────────────────────────────
MODEL_INFO = {
    "🥇 QARI-OCR (Best — built for Arabic)":
        "Fine-tuned vision-language model built specifically for Arabic by the NAMAA Arabic NLP community. "
        "Handles diacritics, mixed fonts, and complex layouts. State-of-the-art accuracy. "
        "First run downloads ~4 GB; subsequent runs are fast.",
    "🥈 EasyOCR (Fast & accurate)":
        "Deep-learning OCR with strong Arabic support. Great speed/accuracy trade-off. "
        "~30 s to load first time.",
    "🥉 PaddleOCR (Also excellent)":
        "PP-OCR v4 — very fast and accurate for clean printed Arabic text.",
}

# The first entry (QARI-OCR) is the model preselected in the dropdown.
DEFAULT_MODEL = next(iter(MODEL_INFO))

# ── Custom CSS ────────────────────────────────────────────────────────────────
CSS = """
#title { text-align: center; }
#subtitle { text-align: center; color: #666; margin-top: -10px; }
#run-btn { font-size: 1.1em !important; }
.arabic-out textarea {
    direction: rtl !important;
    text-align: right !important;
    font-size: 15px !important;
    line-height: 1.9 !important;
    font-family: 'Amiri', 'Scheherazade New', 'Arabic Typesetting', serif !important;
}
.model-note {
    font-size: 0.85em;
    color: #666;
    margin-top: -6px;
    padding: 4px 6px;
    background: #f8f8f8;
    border-radius: 6px;
}
footer { display: none !important; }
"""

# ── Gradio UI ─────────────────────────────────────────────────────────────────
with gr.Blocks(css=CSS, title="Arabic PDF OCR", theme=gr.themes.Soft()) as demo:
    # NOTE(review): the header/footer markup was missing from the reviewed
    # source. The tags and ids below were chosen to match the #title and
    # #subtitle CSS rules — confirm against the original.
    gr.HTML("<h1 id='title'>🕌 Arabic PDF OCR</h1>")
    gr.HTML(
        "<p id='subtitle'>Upload an Arabic PDF → pick a model → extract text. "
        "Free & open-source.</p>"
    )

    with gr.Row(equal_height=False):
        # Left — controls
        with gr.Column(scale=1, min_width=300):
            pdf_input = gr.File(
                label="📎 Upload Arabic PDF",
                file_types=[".pdf"],
                height=160,
            )
            model_choice = gr.Dropdown(
                choices=list(MODEL_INFO.keys()),
                value=DEFAULT_MODEL,
                label="🤖 OCR Model",
                interactive=True,
            )
            model_note = gr.Markdown(
                MODEL_INFO[DEFAULT_MODEL],
                elem_classes=["model-note"],
            )
            dpi = gr.Slider(
                minimum=150,
                maximum=400,
                value=300,
                step=50,
                label="📐 Scan quality (DPI)",
                info="300 is ideal. Use 400 for blurry or small text.",
            )
            run_btn = gr.Button(
                "🔍 Extract Text",
                variant="primary",
                size="lg",
                elem_id="run-btn",
            )

        # Right — output
        with gr.Column(scale=2):
            text_out = gr.Textbox(
                label="📝 Extracted Text",
                lines=22,
                placeholder="Your Arabic text will appear here after extraction…",
                show_copy_button=True,
                elem_classes=["arabic-out"],
            )
            file_out = gr.File(label="💾 Download as .txt", interactive=False)

    def update_note(choice):
        """Return the description for the selected model ("" if unknown)."""
        return MODEL_INFO.get(choice, "")

    model_choice.change(update_note, inputs=model_choice, outputs=model_note)
    run_btn.click(
        fn=run_ocr,
        inputs=[pdf_input, model_choice, dpi],
        outputs=[text_out, file_out],
    )

    gr.HTML("""
Powered by open-source engines &nbsp;·&nbsp; QARI-OCR (NAMAA) &nbsp;·&nbsp; EasyOCR &nbsp;·&nbsp; PaddleOCR
""")

# Launch only when run as a script so importing the module has no side effect.
if __name__ == "__main__":
    demo.launch()