Spaces:

Zarm33na
/

bilingual-ocr-api

Sleeping

App Files Files Community

bilingual-ocr-api / script_detection.py

Zarm33na

Initial deployment: bilingual OCR API (Urdu + English)

04f9475 3 months ago

raw

history blame contribute delete

5.12 kB

	"""
	Script detection for OCR pipelines: Urdu vs English.

	Uses lightweight pytesseract OCR and Unicode range checks only.
	No ML models, no training. Explainable and FYP-safe.
	Reusable by future pipelines (bilingual routing, etc.).
	"""

	from __future__ import annotations

	import argparse
	import sys
	from pathlib import Path
	from typing import Literal

	# ---------------------------------------------------------------------------
	# Unicode ranges (explainable classification)
	# ---------------------------------------------------------------------------
	# Arabic block: used for Urdu and Arabic script (FYP-safe heuristic).
	ARABIC_START = 0x0600
	ARABIC_END = 0x06FF


	def _has_arabic_script(text: str) -> bool:
	"""
	Return True if any character in text falls in the Arabic Unicode block (U+0600–U+06FF).

	Urdu uses the Arabic script, so presence of this range indicates Urdu (or Arabic).
	No ML; purely explainable Unicode check.
	"""
	if not text:
	return False
	for char in text:
	if ARABIC_START <= ord(char) <= ARABIC_END:
	return True
	return False


	def _tesseract_string(image_arg, *, lang: str) -> str:
	"""Run Tesseract and return text; empty on any error."""
	try:
	import pytesseract
	if isinstance(image_arg, (str, Path)):
	return pytesseract.image_to_string(str(image_arg), lang=lang) or ""
	from PIL import Image
	import numpy as np
	arr = image_arg
	if len(arr.shape) == 3:
	arr = arr[:, :, ::-1]
	pil = Image.fromarray(arr)
	return pytesseract.image_to_string(pil, lang=lang) or ""
	except Exception:
	return ""


	def _has_latin_letters(text: str) -> bool:
	"""True if text has at least one Latin letter (A-Z, a-z)."""
	for c in text:
	if "a" <= c <= "z" or "A" <= c <= "Z":
	return True
	return False


	def detect_script_page(image: str \| Path) -> Literal["urdu"] \| None:
	"""
	Page-level script check: only treat page as "all Urdu" when page has Arabic and no clear English.

	Run Tesseract with lang="eng" first on the full page. If eng output has Latin letters → None
	(page is English or mixed; use per-crop detection). If eng is empty or has no Latin, run lang="ara";
	if ara has Arabic → "urdu" (treat all crops on this page as Urdu). Else → None.
	This keeps all-English pages (e.g. QSL card) from being forced to Urdu when ara returns noise.
	"""
	if isinstance(image, (str, Path)) and not Path(image).is_file():
	return None
	text_eng = _tesseract_string(image, lang="eng")
	if text_eng.strip() and _has_latin_letters(text_eng):
	return None
	text_ara = _tesseract_string(image, lang="ara")
	return "urdu" if _has_arabic_script(text_ara) else None


	def detect_script(image: str \| Path \| "np.ndarray") -> dict:
	"""
	Detect script (Urdu vs English): English only when eng output clearly has Latin text.

	1. Run Tesseract with lang="eng" first.
	2. If output has Arabic Unicode → "urdu".
	3. If output has Latin letters (A–Z, a–z) and no Arabic → "english".
	4. If output is empty or has no Latin letters (e.g. numbers only, or Urdu crop) → try lang="ara";
	if ara has Arabic → "urdu", else "english".
	So: all-English pages get Latin from eng → English; Urdu crops get little/no Latin from eng, we try ara → Urdu.
	"""
	if isinstance(image, (str, Path)) and not Path(image).is_file():
	return {"script": "english", "confidence": "heuristic"}

	text_eng = _tesseract_string(image, lang="eng")
	if _has_arabic_script(text_eng):
	return {"script": "urdu", "confidence": "heuristic"}
	if text_eng.strip() and _has_latin_letters(text_eng):
	return {"script": "english", "confidence": "heuristic"}

	text_ara = _tesseract_string(image, lang="ara")
	script: Literal["urdu", "english"] = "urdu" if _has_arabic_script(text_ara) else "english"
	return {"script": script, "confidence": "heuristic"}


	# ---------------------------------------------------------------------------
	# Main: test detection on sample images
	# ---------------------------------------------------------------------------
	def _main() -> None:
	parser = argparse.ArgumentParser(
	description="Detect script (Urdu vs English) from images using pytesseract + Unicode checks.",
	)
	parser.add_argument(
	"images",
	type=Path,
	nargs="*",
	help="Paths to sample images to test. If none, print usage.",
	)
	args = parser.parse_args()

	if not args.images:
	print("Usage: python script_detection.py <image1> [image2 ...]", file=sys.stderr)
	print("Example: python script_detection.py doc.png", file=sys.stderr)
	sys.exit(0)

	for path in args.images:
	if not path.is_file():
	print(f"Skip (not found): {path}", file=sys.stderr)
	continue
	result = detect_script(path)
	print(f"{path.name}: script={result['script']}, confidence={result['confidence']}")


	if __name__ == "__main__":
	_main()