| import gradio as gr |
| from pdf2image import convert_from_path |
| from PIL import Image |
| import os |
| import tempfile |
|
|
| |
# Process-wide registry of already-constructed OCR engines. The loader
# functions in this file store their result here under the keys "qari",
# "easyocr" and "paddle" and reuse it on later calls.
_cache = {}
|
|
def load_qari():
    """Return the QARI-OCR processor/model pair, building it on first use.

    Returns:
        A dict with the keys ``"processor"`` and ``"model"``. The same dict
        is stored in ``_cache["qari"]`` and handed back on every later call.
    """
    if "qari" in _cache:
        return _cache["qari"]

    # These imports run only on the first load, inside this function.
    from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
    import torch

    checkpoint = "NAMAA-Space/Qari-OCR-v0.3-VL-2B-Instruct"
    # float16 when CUDA is available, float32 otherwise.
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    processor = AutoProcessor.from_pretrained(checkpoint)
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        checkpoint,
        torch_dtype=dtype,
        device_map="auto",
    )

    bundle = {"processor": processor, "model": model}
    _cache["qari"] = bundle
    return bundle
|
|
def load_easyocr():
    """Return the shared EasyOCR reader for Arabic, creating it on first call.

    The reader is built with ``gpu=False`` and kept in ``_cache["easyocr"]``.
    """
    if "easyocr" in _cache:
        return _cache["easyocr"]

    # Imported here so the package is only loaded when this engine is used.
    import easyocr

    reader = easyocr.Reader(["ar"], gpu=False)
    _cache["easyocr"] = reader
    return reader
|
|
def load_paddle():
    """Return the shared PaddleOCR engine for Arabic, creating it on first call.

    The engine is configured with angle classification enabled, GPU disabled
    and logging silenced, and is kept in ``_cache["paddle"]``.
    """
    if "paddle" in _cache:
        return _cache["paddle"]

    # Imported here so the package is only loaded when this engine is used.
    from paddleocr import PaddleOCR

    engine = PaddleOCR(
        use_angle_cls=True,
        lang="ar",
        use_gpu=False,
        show_log=False,
    )
    _cache["paddle"] = engine
    return engine
|
|
| |
def qari_ocr_image(image, m):
    """Run QARI-OCR on a single page image and return the decoded text.

    Args:
        image: Page image exposing ``save(path)``; in this file it is one of
            the images produced by ``convert_from_path``.
        m: Mapping with ``"processor"`` and ``"model"`` entries, as built by
            ``load_qari``.

    Returns:
        The decoded text of the page (first element of the batch decode).
    """
    import torch
    from qwen_vl_utils import process_vision_info

    processor = m["processor"]
    model = m["model"]

    # The page is passed to the vision utilities as a file:// URI. A private
    # per-call directory replaces the previous fixed /tmp/qari_page.png so
    # that concurrent requests cannot overwrite each other's page, and the
    # file is deleted once the model inputs have been built.
    with tempfile.TemporaryDirectory(prefix="qari_") as work_dir:
        page_path = os.path.join(work_dir, "page.png")
        image.save(page_path)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{page_path}"},
                    {"type": "text", "text": "Extract all Arabic text from this image. Output only the text, preserving line breaks."},
                ],
            }
        ]

        text_prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text_prompt],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=2048)

    # generate() returns prompt + completion; drop the prompt tokens so only
    # the newly generated text is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    return processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
|
|
|
|
| |
def run_ocr(pdf_file, model_choice, dpi, progress=gr.Progress(track_tqdm=True)):
    """Render a PDF to page images, OCR every page, and return the text.

    Args:
        pdf_file: Value handed to ``convert_from_path``; ``None`` means that
            nothing was uploaded.
        model_choice: Label of the selected engine. The engine is picked by
            substring: ``"QARI"``, ``"EasyOCR"`` or ``"PaddleOCR"``.
        dpi: Rendering resolution; converted with ``int()``.
        progress: Gradio progress callback.

    Returns:
        ``(text, path)`` where ``path`` is a UTF-8 ``.txt`` file holding the
        same text. If no file was supplied or anything fails, returns
        ``(message, None)`` instead; exceptions are reported in the message
        rather than raised.
    """
    if pdf_file is None:
        return "β οΈ Please upload a PDF file first.", None

    try:
        progress(0.05, desc="Converting PDF pages to imagesβ¦")
        images = convert_from_path(pdf_file, dpi=int(dpi))
        n = len(images)
        all_text = []

        # Page images are written into a directory private to this request
        # (instead of fixed /tmp/page_{i}.png names) so concurrent requests
        # cannot clobber each other, and the images are removed afterwards.
        with tempfile.TemporaryDirectory(prefix="ocr_pages_") as page_dir:
            if "QARI" in model_choice:
                progress(0.10, desc="Loading QARI-OCR model (first run: ~2 min, downloads ~4 GB)β¦")
                m = load_qari()
                for i, image in enumerate(images):
                    progress(0.10 + 0.85 * (i / n), desc=f"QARI-OCR β page {i+1}/{n}")
                    page_text = qari_ocr_image(image, m)
                    all_text.append(f"βββ Page {i+1} βββ\n{page_text}")

            elif "EasyOCR" in model_choice:
                progress(0.10, desc="Loading EasyOCR model (first run: ~30 s)β¦")
                reader = load_easyocr()
                for i, image in enumerate(images):
                    progress(0.10 + 0.85 * (i / n), desc=f"EasyOCR β page {i+1}/{n}")
                    tmp = os.path.join(page_dir, f"page_{i}.png")
                    image.save(tmp)
                    lines = reader.readtext(tmp, detail=0, paragraph=True)
                    all_text.append(f"βββ Page {i+1} βββ\n" + "\n".join(lines))

            elif "PaddleOCR" in model_choice:
                progress(0.10, desc="Loading PaddleOCR model (first run: ~30 s)β¦")
                ocr = load_paddle()
                for i, image in enumerate(images):
                    progress(0.10 + 0.85 * (i / n), desc=f"PaddleOCR β page {i+1}/{n}")
                    tmp = os.path.join(page_dir, f"page_{i}.png")
                    image.save(tmp)
                    result = ocr.ocr(tmp, cls=True)
                    lines = []
                    # Only entries that carry a second element are used; its
                    # first item is taken as the recognised text.
                    if result and result[0]:
                        lines = [
                            line[1][0]
                            for line in result[0]
                            if line and len(line) >= 2 and line[1]
                        ]
                    all_text.append(f"βββ Page {i+1} βββ\n" + "\n".join(lines))

            else:
                # Previously an unknown label fell through and produced an
                # empty result; report it through the normal error path.
                raise ValueError(f"Unknown OCR model: {model_choice!r}")

        progress(0.98, desc="Saving outputβ¦")
        full_text = "\n\n".join(all_text)

        # Each request gets its own output directory so one user's download
        # cannot be overwritten by another request. The file name is kept.
        # The directory is intentionally not deleted here because the path is
        # returned to the caller.
        out_dir = tempfile.mkdtemp(prefix="arabic_ocr_")
        out_path = os.path.join(out_dir, "arabic_ocr_result.txt")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(full_text)

        progress(1.0, desc="Done!")
        return full_text, out_path

    except Exception as e:
        # Top-level boundary for the UI callback: show the failure in the
        # text box instead of letting the handler raise.
        import traceback
        return f"β Error: {e}\n\n{traceback.format_exc()}", None
|
|
|
|
| |
# Maps each model label offered in the dropdown to the short description that
# is displayed for it. Insertion order is the order shown to the user.
MODEL_INFO = {
    "π₯ QARI-OCR (Best β built for Arabic)": (
        "Fine-tuned vision-language model built specifically for Arabic by the NAMAA Arabic NLP community. "
        "Handles diacritics, mixed fonts, and complex layouts. State-of-the-art accuracy. "
        "First run downloads ~4 GB; subsequent runs are fast."
    ),
    "π₯ EasyOCR (Fast & accurate)": (
        "Deep-learning OCR with strong Arabic support. Great speed/accuracy trade-off. ~30 s to load first time."
    ),
    "π₯ PaddleOCR (Also excellent)": (
        "PP-OCR v4 β very fast and accurate for clean printed Arabic text."
    ),
}
|
|
| |
# Custom stylesheet handed to gr.Blocks(css=...).
# - #title / #subtitle / #run-btn style the elements carrying those ids.
# - .arabic-out forces right-to-left, right-aligned text in its textarea.
# - .model-note styles the description shown under the model selector.
# - The final rule hides the page footer.
CSS = """
#title { text-align: center; }
#subtitle { text-align: center; color: #666; margin-top: -10px; }
#run-btn { font-size: 1.1em !important; }
.arabic-out textarea {
direction: rtl !important;
text-align: right !important;
font-size: 15px !important;
line-height: 1.9 !important;
font-family: 'Amiri', 'Scheherazade New', 'Arabic Typesetting', serif !important;
}
.model-note {
font-size: 0.85em;
color: #666;
margin-top: -6px;
padding: 4px 6px;
background: #f8f8f8;
border-radius: 6px;
}
footer { display: none !important; }
"""
|
|
| |
# Build the two-column UI: inputs (PDF, model, DPI, run button) on the left,
# extracted text and the downloadable .txt on the right.
with gr.Blocks(css=CSS, title="Arabic PDF OCR", theme=gr.themes.Soft()) as demo:

    # The first MODEL_INFO entry (QARI-OCR) is the preselected model. Taking
    # it from the dict keeps the dropdown value and the initial description
    # in sync with the keys instead of repeating the label as a literal.
    default_model = next(iter(MODEL_INFO))

    gr.HTML("<h1 id='title'>π Arabic PDF OCR</h1>")
    gr.HTML("<p id='subtitle'>Upload an Arabic PDF β pick a model β extract text. Free & open-source.</p>")

    with gr.Row(equal_height=False):

        # Left column: inputs and controls.
        with gr.Column(scale=1, min_width=300):
            pdf_input = gr.File(
                label="π Upload Arabic PDF",
                file_types=[".pdf"],
                height=160,
            )
            model_choice = gr.Dropdown(
                choices=list(MODEL_INFO.keys()),
                value=default_model,
                label="π€ OCR Model",
                interactive=True,
            )
            model_note = gr.Markdown(
                MODEL_INFO[default_model],
                elem_classes=["model-note"],
            )
            dpi = gr.Slider(
                minimum=150, maximum=400, value=300, step=50,
                label="π Scan quality (DPI)",
                info="300 is ideal. Use 400 for blurry or small text.",
            )
            run_btn = gr.Button(
                "π Extract Text", variant="primary",
                size="lg", elem_id="run-btn",
            )

        # Right column: results.
        with gr.Column(scale=2):
            text_out = gr.Textbox(
                label="π Extracted Text",
                lines=22,
                placeholder="Your Arabic text will appear here after extractionβ¦",
                show_copy_button=True,
                elem_classes=["arabic-out"],
            )
            file_out = gr.File(label="πΎ Download as .txt", interactive=False)

    def update_note(choice):
        """Return the description for the selected model, or "" if unknown."""
        return MODEL_INFO.get(choice, "")

    # Refresh the description whenever a different model is selected.
    model_choice.change(update_note, inputs=model_choice, outputs=model_note)

    # Run the OCR pipeline; its two return values fill the text box and the
    # download component.
    run_btn.click(
        fn=run_ocr,
        inputs=[pdf_input, model_choice, dpi],
        outputs=[text_out, file_out],
    )

    gr.HTML("""
<div style="text-align:center;margin-top:20px;color:#aaa;font-size:0.82em">
Powered by open-source engines Β·
<a href="https://huggingface.co/NAMAA-Space/Qari-OCR-v0.3-VL-2B-Instruct" target="_blank">QARI-OCR (NAMAA)</a> Β·
<a href="https://github.com/JaidedAI/EasyOCR" target="_blank">EasyOCR</a> Β·
<a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">PaddleOCR</a>
</div>
""")


demo.launch()
|
|