# NOTE: the next four lines are repository-page residue, not program code.
# ocr / app.py
# mahdisd's picture
# Upload 4 files
# 50d4599 verified
import gradio as gr
from pdf2image import convert_from_path
from PIL import Image
import os
import tempfile
# ── Lazy model cache (loaded once per session) ───────────────────────────────
# Maps an engine key ("qari", "easyocr", "paddle") to its loaded model object.
# Entries are added on first use by load_qari / load_easyocr / load_paddle and
# are never evicted.
_cache = {}
def load_qari():
    """Return the QARI-OCR processor/model pair, loading it on first call.

    Returns:
        A dict with two entries: ``"processor"`` (from
        ``AutoProcessor.from_pretrained``) and ``"model"`` (from
        ``Qwen2VLForConditionalGeneration.from_pretrained``). The same dict
        is returned on every later call.
    """
    if "qari" in _cache:
        return _cache["qari"]

    # Imported here rather than at module top, so these packages are only
    # loaded when this engine is first requested.
    from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
    import torch

    model_id = "NAMAA-Space/Qari-OCR-v0.3-VL-2B-Instruct"
    processor = AutoProcessor.from_pretrained(model_id)
    # Half precision only when CUDA is available; otherwise full precision.
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=dtype,
        device_map="auto",
    )
    _cache["qari"] = {"processor": processor, "model": model}
    return _cache["qari"]
def load_easyocr():
    """Return the shared EasyOCR reader for Arabic, creating it on first call.

    The reader is built with ``gpu=False`` and stored in ``_cache`` under the
    ``"easyocr"`` key, so later calls return the same object.
    """
    if "easyocr" in _cache:
        return _cache["easyocr"]

    # Imported on demand so the package is only loaded when this engine is used.
    import easyocr

    reader = easyocr.Reader(["ar"], gpu=False)
    _cache["easyocr"] = reader
    return _cache["easyocr"]
def load_paddle():
    """Return the shared PaddleOCR engine for Arabic, creating it on first call.

    The engine is built with angle classification enabled, GPU disabled and
    logging turned off, then stored in ``_cache`` under the ``"paddle"`` key so
    later calls return the same object.
    """
    if "paddle" in _cache:
        return _cache["paddle"]

    # Imported on demand so the package is only loaded when this engine is used.
    from paddleocr import PaddleOCR

    engine = PaddleOCR(
        use_angle_cls=True,
        lang="ar",
        use_gpu=False,
        show_log=False,
    )
    _cache["paddle"] = engine
    return _cache["paddle"]
# ── QARI-OCR single-image inference ──────────────────────────────────────────
def qari_ocr_image(image, m):
    """Run QARI-OCR on a single page image and return the decoded text.

    Args:
        image: Page image; it must provide ``save(path)``. ``run_ocr`` passes
            the page images returned by ``convert_from_path``.
        m: Mapping with the ``"processor"`` and ``"model"`` entries, as built
            by ``load_qari``.

    Returns:
        The decoded text of the first (and only) sequence in the batch, with
        the prompt tokens removed.
    """
    import torch
    from qwen_vl_utils import process_vision_info

    # Use a private directory per call. A fixed path such as
    # /tmp/qari_page.png is shared by every request, so two concurrent runs
    # could overwrite each other's page, and the file was never removed.
    with tempfile.TemporaryDirectory(prefix="qari_ocr_") as work_dir:
        tmp = os.path.join(work_dir, "qari_page.png")
        image.save(tmp)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{tmp}"},
                    {"type": "text", "text": "Extract all Arabic text from this image. Output only the text, preserving line breaks."},
                ],
            }
        ]
        text_prompt = m["processor"].apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = m["processor"](
            text=[text_prompt],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(m["model"].device)

        with torch.no_grad():
            generated_ids = m["model"].generate(**inputs, max_new_tokens=2048)

        # generate() returns prompt + completion; drop the prompt part of
        # each sequence so only newly generated tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        return m["processor"].batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
# ── Main OCR runner ───────────────────────────────────────────────────────────
def _qari_page_text(engine, image, png_path):
    """Return the QARI-OCR text for one page (``png_path`` is not used)."""
    return qari_ocr_image(image, engine)


def _easyocr_page_text(engine, image, png_path):
    """Save the page to ``png_path`` and return EasyOCR's paragraphs, one per line."""
    image.save(png_path)
    return "\n".join(engine.readtext(png_path, detail=0, paragraph=True))


def _paddle_page_text(engine, image, png_path):
    """Save the page to ``png_path`` and return PaddleOCR's text lines joined by newlines."""
    image.save(png_path)
    result = engine.ocr(png_path, cls=True)
    if not result or not result[0]:
        return ""
    # Each kept entry is indexed as entry[1][0] to obtain its text string.
    return "\n".join(
        entry[1][0]
        for entry in result[0]
        if entry and len(entry) >= 2 and entry[1]
    )


def run_ocr(pdf_file, model_choice, dpi, progress=gr.Progress(track_tqdm=True)):
    """Convert a PDF to page images, OCR every page and save the text.

    Args:
        pdf_file: Path of the uploaded PDF, or ``None`` when nothing was
            uploaded.
        model_choice: Dropdown label; the engine is selected by whether it
            contains ``"QARI"``, ``"EasyOCR"`` or ``"PaddleOCR"``.
        dpi: Rendering resolution, converted with ``int()`` before use.
        progress: Gradio progress callback.

    Returns:
        ``(text, path)``: the extracted text and the path of a UTF-8 ``.txt``
        file containing it. On a missing upload, an unknown model or any
        exception, ``text`` is a message for the user and ``path`` is ``None``.
    """
    if pdf_file is None:
        return "⚠️ Please upload a PDF file first.", None
    try:
        # Resolve the engine first so an unusable choice fails before the
        # (expensive) PDF rendering instead of yielding an empty result.
        choice = model_choice or ""
        if "QARI" in choice:
            label, loader, read_page = "QARI-OCR", load_qari, _qari_page_text
            loading = "Loading QARI-OCR model (first run: ~2 min, downloads ~4 GB)…"
        elif "EasyOCR" in choice:
            label, loader, read_page = "EasyOCR", load_easyocr, _easyocr_page_text
            loading = "Loading EasyOCR model (first run: ~30 s)…"
        elif "PaddleOCR" in choice:
            label, loader, read_page = "PaddleOCR", load_paddle, _paddle_page_text
            loading = "Loading PaddleOCR model (first run: ~30 s)…"
        else:
            return f"❌ Error: unknown OCR model {model_choice!r}.", None

        progress(0.05, desc="Converting PDF pages to images…")
        images = convert_from_path(pdf_file, dpi=int(dpi))
        n = len(images)

        progress(0.10, desc=loading)
        engine = loader()

        all_text = []
        # Page PNGs live in a per-request directory: fixed /tmp/page_{i}.png
        # names collide between concurrent requests and were never deleted.
        with tempfile.TemporaryDirectory(prefix="arabic_ocr_pages_") as page_dir:
            for i, image in enumerate(images):
                progress(0.10 + 0.85 * (i / n), desc=f"{label} — page {i+1}/{n}")
                png_path = os.path.join(page_dir, f"page_{i}.png")
                page_text = read_page(engine, image, png_path)
                all_text.append(f"─── Page {i+1} ───\n{page_text}")

        progress(0.98, desc="Saving output…")
        full_text = "\n\n".join(all_text)
        # A fresh directory per request keeps the file name stable while
        # preventing one request from overwriting another's result. It is
        # deliberately not removed here because the caller still needs the file.
        out_dir = tempfile.mkdtemp(prefix="arabic_ocr_")
        out_path = os.path.join(out_dir, "arabic_ocr_result.txt")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(full_text)
        progress(1.0, desc="Done!")
        return full_text, out_path
    except Exception as e:
        # UI boundary: report the failure in the text box instead of raising.
        import traceback
        return f"❌ Error: {e}\n\n{traceback.format_exc()}", None
# ── Model descriptions ────────────────────────────────────────────────────────
# Dropdown label -> description text. The keys are offered as the model
# dropdown's choices and the values are displayed in the note under it.
MODEL_INFO = {
    "πŸ₯‡ QARI-OCR (Best β€” built for Arabic)": (
        "Fine-tuned vision-language model built specifically for Arabic by the NAMAA Arabic NLP community. "
        "Handles diacritics, mixed fonts, and complex layouts. State-of-the-art accuracy. "
        "First run downloads ~4 GB; subsequent runs are fast."
    ),
    "πŸ₯ˆ EasyOCR (Fast & accurate)": (
        "Deep-learning OCR with strong Arabic support. Great speed/accuracy trade-off. ~30 s to load first time."
    ),
    "πŸ₯‰ PaddleOCR (Also excellent)": (
        "PP-OCR v4 β€” very fast and accurate for clean printed Arabic text."
    ),
}
# ── Custom CSS ────────────────────────────────────────────────────────────────
# Stylesheet passed to gr.Blocks(css=...).
#   #title / #subtitle / #run-btn  -> elements created with those ids
#   .arabic-out textarea           -> right-to-left, right-aligned output box
#   .model-note                    -> small grey note under the model dropdown
#   footer                         -> any <footer> element is hidden
CSS = """
#title { text-align: center; }
#subtitle { text-align: center; color: #666; margin-top: -10px; }
#run-btn { font-size: 1.1em !important; }
.arabic-out textarea {
direction: rtl !important;
text-align: right !important;
font-size: 15px !important;
line-height: 1.9 !important;
font-family: 'Amiri', 'Scheherazade New', 'Arabic Typesetting', serif !important;
}
.model-note {
font-size: 0.85em;
color: #666;
margin-top: -6px;
padding: 4px 6px;
background: #f8f8f8;
border-radius: 6px;
}
footer { display: none !important; }
"""
# ── Gradio UI ─────────────────────────────────────────────────────────────────
# Page layout: a controls column (upload, model, DPI, run button) next to an
# output column (extracted text and downloadable .txt), followed by a footer.
# NOTE(review): several labels below contain mis-encoded characters; they are
# reproduced as found. The dropdown labels are also the MODEL_INFO keys, so
# any repair has to be made in both places together.
with gr.Blocks(css=CSS, title="Arabic PDF OCR", theme=gr.themes.Soft()) as demo:
    # The first MODEL_INFO entry is the preselected model. Deriving it here
    # keeps the dropdown value and the initial note in sync with the dict
    # instead of repeating the label literal twice.
    _default_model = next(iter(MODEL_INFO))

    gr.HTML("<h1 id='title'>πŸ•Œ Arabic PDF OCR</h1>")
    gr.HTML("<p id='subtitle'>Upload an Arabic PDF β†’ pick a model β†’ extract text. Free & open-source.</p>")

    with gr.Row(equal_height=False):
        # Left: controls
        with gr.Column(scale=1, min_width=300):
            pdf_input = gr.File(
                label="πŸ“Ž Upload Arabic PDF",
                file_types=[".pdf"],
                height=160,
            )
            model_choice = gr.Dropdown(
                choices=list(MODEL_INFO.keys()),
                value=_default_model,
                label="πŸ€– OCR Model",
                interactive=True,
            )
            model_note = gr.Markdown(
                MODEL_INFO[_default_model],
                elem_classes=["model-note"],
            )
            dpi = gr.Slider(
                minimum=150, maximum=400, value=300, step=50,
                label="πŸ“ Scan quality (DPI)",
                info="300 is ideal. Use 400 for blurry or small text.",
            )
            run_btn = gr.Button(
                "πŸ” Extract Text", variant="primary",
                size="lg", elem_id="run-btn",
            )
        # Right: output
        with gr.Column(scale=2):
            text_out = gr.Textbox(
                label="πŸ“ Extracted Text",
                lines=22,
                placeholder="Your Arabic text will appear here after extraction…",
                show_copy_button=True,
                elem_classes=["arabic-out"],
            )
            file_out = gr.File(label="πŸ’Ύ Download as .txt", interactive=False)

    def update_note(choice):
        """Return the description for the selected model, or "" if unknown."""
        return MODEL_INFO.get(choice, "")

    # Refresh the note whenever a different model is picked.
    model_choice.change(update_note, inputs=model_choice, outputs=model_note)
    # run_ocr returns (text, file path), matching the two outputs in order.
    run_btn.click(
        fn=run_ocr,
        inputs=[pdf_input, model_choice, dpi],
        outputs=[text_out, file_out],
    )
    gr.HTML("""
<div style="text-align:center;margin-top:20px;color:#aaa;font-size:0.82em">
Powered by open-source engines &nbsp;Β·&nbsp;
<a href="https://huggingface.co/NAMAA-Space/Qari-OCR-v0.3-VL-2B-Instruct" target="_blank">QARI-OCR (NAMAA)</a> &nbsp;Β·&nbsp;
<a href="https://github.com/JaidedAI/EasyOCR" target="_blank">EasyOCR</a> &nbsp;Β·&nbsp;
<a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">PaddleOCR</a>
</div>
""")

demo.launch()