Spaces:

mahdisd
/

ocr

Runtime error

App Files Files Community

ocr / app.py

mahdisd

Upload 4 files

50d4599 verified about 2 months ago

raw

history blame contribute delete

10.2 kB

	import gradio as gr
	from pdf2image import convert_from_path
	from PIL import Image
	import os
	import tempfile

	# ── Lazy model cache ─────────────────────────────────────────────────────────
	# Module-level store filled on first use by the load_* functions below.
	# Keys used in this file: "qari", "easyocr", "paddle"; each maps to the
	# object built by the matching loader so it is constructed at most once.
	_cache = {}

	def load_qari():
	    """Return the QARI-OCR processor/model pair, building it on first call.

	    ``transformers`` and ``torch`` are imported only when the pair actually
	    has to be built, so the other engines do not pay for them.

	    Returns:
	        dict: ``{"processor": ..., "model": ...}`` as stored under
	        ``_cache["qari"]``.
	    """
	    if "qari" in _cache:
	        return _cache["qari"]

	    from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
	    import torch

	    model_id = "NAMAA-Space/Qari-OCR-v0.3-VL-2B-Instruct"

	    processor = AutoProcessor.from_pretrained(model_id)
	    # Half precision only when CUDA is available; float32 otherwise.
	    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
	    model = Qwen2VLForConditionalGeneration.from_pretrained(
	        model_id,
	        torch_dtype=dtype,
	        device_map="auto",
	    )

	    _cache["qari"] = {"processor": processor, "model": model}
	    return _cache["qari"]

	def load_easyocr():
	    """Return the shared EasyOCR reader, creating it on first call.

	    The reader is built for the ``"ar"`` language list with ``gpu=False``
	    and kept in ``_cache["easyocr"]``.
	    """
	    if "easyocr" in _cache:
	        return _cache["easyocr"]

	    import easyocr

	    reader = easyocr.Reader(["ar"], gpu=False)
	    _cache["easyocr"] = reader
	    return reader

	def load_paddle():
	    """Return the shared PaddleOCR engine, creating it on first call.

	    The engine is constructed with angle classification enabled, language
	    ``"ar"``, GPU off and logging off, then kept in ``_cache["paddle"]``.
	    """
	    if "paddle" in _cache:
	        return _cache["paddle"]

	    from paddleocr import PaddleOCR

	    engine = PaddleOCR(
	        use_angle_cls=True,
	        lang="ar",
	        use_gpu=False,
	        show_log=False,
	    )
	    _cache["paddle"] = engine
	    return engine

	# ── QARI-OCR single-image inference ──────────────────────────────────────────
	def qari_ocr_image(image, m):
	    """Run QARI-OCR on a single page image and return the decoded text.

	    Args:
	        image: Object exposing ``save(path)``; the caller in this file passes
	            the page images produced by ``convert_from_path``.
	        m: Mapping with ``"processor"`` and ``"model"`` entries, as returned
	            by ``load_qari()``.

	    Returns:
	        The first decoded string of the generation, with the prompt tokens
	        stripped and special tokens skipped.
	    """
	    import torch
	    from qwen_vl_utils import process_vision_info

	    # Use a unique file per call: a fixed path would be overwritten when two
	    # requests run at the same time, and would never be cleaned up.
	    fd, tmp = tempfile.mkstemp(prefix="qari_page_", suffix=".png")
	    os.close(fd)  # only the path is needed; image.save() reopens it

	    try:
	        image.save(tmp)

	        messages = [
	            {
	                "role": "user",
	                "content": [
	                    {"type": "image", "image": f"file://{tmp}"},
	                    {"type": "text", "text": "Extract all Arabic text from this image. Output only the text, preserving line breaks."},
	                ],
	            }
	        ]

	        text_prompt = m["processor"].apply_chat_template(
	            messages, tokenize=False, add_generation_prompt=True
	        )
	        image_inputs, video_inputs = process_vision_info(messages)
	        inputs = m["processor"](
	            text=[text_prompt],
	            images=image_inputs,
	            videos=video_inputs,
	            padding=True,
	            return_tensors="pt",
	        ).to(m["model"].device)

	        with torch.no_grad():
	            generated_ids = m["model"].generate(**inputs, max_new_tokens=2048)

	        # Drop the echoed prompt: keep only the tokens after each input.
	        generated_ids_trimmed = [
	            out_ids[len(in_ids):]
	            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
	        ]
	        return m["processor"].batch_decode(
	            generated_ids_trimmed,
	            skip_special_tokens=True,
	            clean_up_tokenization_spaces=False,
	        )[0]
	    finally:
	        # Best-effort cleanup; a failed delete must not mask the OCR result
	        # or the original exception.
	        try:
	            os.remove(tmp)
	        except OSError:
	            pass


	# ── Main OCR runner ───────────────────────────────────────────────────────────
	def run_ocr(pdf_file, model_choice, dpi, progress=gr.Progress(track_tqdm=True)):
	    """Render a PDF to page images, OCR every page, and save the text.

	    Args:
	        pdf_file: Value handed to ``convert_from_path``; ``None`` means no
	            file was uploaded.
	        model_choice: Label of the selected engine. It is matched by
	            substring: ``"QARI"``, ``"EasyOCR"`` or ``"PaddleOCR"``.
	        dpi: Rendering resolution; coerced with ``int()``.
	        progress: Progress callback, called with a fraction and ``desc``.

	    Returns:
	        ``(text, path)`` where ``path`` is a UTF-8 ``.txt`` file holding the
	        same text, or ``(message, None)`` when there is nothing to process,
	        the model label is not recognised, or an exception occurred.
	    """
	    if pdf_file is None:
	        return "⚠️ Please upload a PDF file first.", None

	    # Normalise a missing selection so the substring tests cannot raise, and
	    # reject unknown labels up front instead of returning an empty result.
	    choice = model_choice or ""
	    if not any(tag in choice for tag in ("QARI", "EasyOCR", "PaddleOCR")):
	        return f"❌ Error: unknown OCR model: {model_choice!r}", None

	    try:
	        progress(0.05, desc="Converting PDF pages to images…")
	        images = convert_from_path(pdf_file, dpi=int(dpi))
	        n = len(images)
	        all_text = []

	        # Page images go into a private per-call directory so concurrent
	        # requests cannot overwrite each other's files; it is removed when
	        # the block exits, even on error.
	        with tempfile.TemporaryDirectory(prefix="arabic_ocr_pages_") as page_dir:

	            # ── QARI-OCR ──────────────────────────────────────────────────
	            if "QARI" in choice:
	                progress(0.10, desc="Loading QARI-OCR model (first run: ~2 min, downloads ~4 GB)…")
	                m = load_qari()
	                for i, image in enumerate(images):
	                    progress(0.10 + 0.85 * (i / n), desc=f"QARI-OCR — page {i+1}/{n}")
	                    page_text = qari_ocr_image(image, m)
	                    all_text.append(f"─── Page {i+1} ───\n{page_text}")

	            # ── EasyOCR ───────────────────────────────────────────────────
	            elif "EasyOCR" in choice:
	                progress(0.10, desc="Loading EasyOCR model (first run: ~30 s)…")
	                reader = load_easyocr()
	                for i, image in enumerate(images):
	                    progress(0.10 + 0.85 * (i / n), desc=f"EasyOCR — page {i+1}/{n}")
	                    tmp = os.path.join(page_dir, f"page_{i}.png")
	                    image.save(tmp)
	                    lines = reader.readtext(tmp, detail=0, paragraph=True)
	                    all_text.append(f"─── Page {i+1} ───\n" + "\n".join(lines))

	            # ── PaddleOCR ─────────────────────────────────────────────────
	            elif "PaddleOCR" in choice:
	                progress(0.10, desc="Loading PaddleOCR model (first run: ~30 s)…")
	                ocr = load_paddle()
	                for i, image in enumerate(images):
	                    progress(0.10 + 0.85 * (i / n), desc=f"PaddleOCR — page {i+1}/{n}")
	                    tmp = os.path.join(page_dir, f"page_{i}.png")
	                    image.save(tmp)
	                    result = ocr.ocr(tmp, cls=True)
	                    # Keep the first element of each entry's second item;
	                    # skip empty or malformed entries.
	                    entries = result[0] if result and result[0] else []
	                    lines = [
	                        line[1][0]
	                        for line in entries
	                        if line and len(line) >= 2 and line[1]
	                    ]
	                    all_text.append(f"─── Page {i+1} ───\n" + "\n".join(lines))

	        progress(0.98, desc="Saving output…")
	        full_text = "\n\n".join(all_text)

	        # The result file must outlive this call so it can be downloaded, so
	        # it gets its own per-call directory; the file name stays the same.
	        out_dir = tempfile.mkdtemp(prefix="arabic_ocr_")
	        out_path = os.path.join(out_dir, "arabic_ocr_result.txt")
	        with open(out_path, "w", encoding="utf-8") as f:
	            f.write(full_text)

	        progress(1.0, desc="Done!")
	        return full_text, out_path

	    except Exception as e:
	        # UI boundary: report the failure in the text box instead of raising.
	        import traceback
	        return f"❌ Error: {e}\n\n{traceback.format_exc()}", None


	# ── Model descriptions ────────────────────────────────────────────────────────
	# Dropdown label → short description shown under the model selector.
	MODEL_INFO = {
	    "🥇 QARI-OCR (Best — built for Arabic)": (
	        "Fine-tuned vision-language model built specifically for Arabic by the NAMAA Arabic NLP community. "
	        "Handles diacritics, mixed fonts, and complex layouts. State-of-the-art accuracy. "
	        "First run downloads ~4 GB; subsequent runs are fast."
	    ),
	    "🥈 EasyOCR (Fast & accurate)": (
	        "Deep-learning OCR with strong Arabic support. Great speed/accuracy trade-off. ~30 s to load first time."
	    ),
	    "🥉 PaddleOCR (Also excellent)": (
	        "PP-OCR v4 — very fast and accurate for clean printed Arabic text."
	    ),
	}

	# ── Custom CSS ────────────────────────────────────────────────────────────────
	# Stylesheet string: centers #title and #subtitle, enlarges #run-btn, renders
	# the .arabic-out textarea right-to-left with a serif font stack, styles the
	# .model-note box, and hides the page footer.
	CSS = """
	#title { text-align: center; }
	#subtitle { text-align: center; color: #666; margin-top: -10px; }
	#run-btn { font-size: 1.1em !important; }
	.arabic-out textarea {
	direction: rtl !important;
	text-align: right !important;
	font-size: 15px !important;
	line-height: 1.9 !important;
	font-family: 'Amiri', 'Scheherazade New', 'Arabic Typesetting', serif !important;
	}
	.model-note {
	font-size: 0.85em;
	color: #666;
	margin-top: -6px;
	padding: 4px 6px;
	background: #f8f8f8;
	border-radius: 6px;
	}
	footer { display: none !important; }
	"""

	# ── Gradio UI ─────────────────────────────────────────────────────────────────
	# Two-column page: upload/model/DPI controls on the left, extracted text and
	# the downloadable .txt on the right.
	with gr.Blocks(theme=gr.themes.Soft(), title="Arabic PDF OCR", css=CSS) as demo:

	    gr.HTML("<h1 id='title'>🕌 Arabic PDF OCR</h1>")
	    gr.HTML("<p id='subtitle'>Upload an Arabic PDF → pick a model → extract text. Free & open-source.</p>")

	    with gr.Row(equal_height=False):

	        # Input side
	        with gr.Column(min_width=300, scale=1):
	            pdf_input = gr.File(
	                file_types=[".pdf"],
	                height=160,
	                label="📎 Upload Arabic PDF",
	            )
	            model_choice = gr.Dropdown(
	                label="🤖 OCR Model",
	                choices=list(MODEL_INFO),
	                value="🥇 QARI-OCR (Best — built for Arabic)",
	                interactive=True,
	            )
	            # Starts with the description of the preselected model.
	            model_note = gr.Markdown(
	                MODEL_INFO["🥇 QARI-OCR (Best — built for Arabic)"],
	                elem_classes=["model-note"],
	            )
	            dpi = gr.Slider(
	                label="📐 Scan quality (DPI)",
	                info="300 is ideal. Use 400 for blurry or small text.",
	                minimum=150,
	                maximum=400,
	                step=50,
	                value=300,
	            )
	            run_btn = gr.Button(
	                "🔍 Extract Text",
	                elem_id="run-btn",
	                size="lg",
	                variant="primary",
	            )

	        # Result side
	        with gr.Column(scale=2):
	            text_out = gr.Textbox(
	                label="📝 Extracted Text",
	                placeholder="Your Arabic text will appear here after extraction…",
	                lines=22,
	                elem_classes=["arabic-out"],
	                show_copy_button=True,
	            )
	            file_out = gr.File(interactive=False, label="💾 Download as .txt")

	    def update_note(choice):
	        """Return the description for the selected model, or "" if unknown."""
	        return MODEL_INFO.get(choice, "")

	    # Refresh the description whenever a different model is selected.
	    model_choice.change(fn=update_note, inputs=model_choice, outputs=model_note)

	    # Run the OCR pipeline and fill both the text box and the download slot.
	    run_btn.click(
	        fn=run_ocr,
	        inputs=[pdf_input, model_choice, dpi],
	        outputs=[text_out, file_out],
	    )

	    gr.HTML("""
	<div style="text-align:center;margin-top:20px;color:#aaa;font-size:0.82em">
	Powered by open-source engines  ·
	<a href="https://huggingface.co/NAMAA-Space/Qari-OCR-v0.3-VL-2B-Instruct" target="_blank">QARI-OCR (NAMAA)</a>  ·
	<a href="https://github.com/JaidedAI/EasyOCR" target="_blank">EasyOCR</a>  ·
	<a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">PaddleOCR</a>
	</div>
	""")

	demo.launch()