Spaces:
Sleeping
Sleeping
"""
Script detection for OCR pipelines: Urdu vs English.
Uses lightweight pytesseract OCR and Unicode range checks only.
No ML models, no training. Explainable and FYP-safe.
Reusable by future pipelines (bilingual routing, etc.).
"""
| from __future__ import annotations | |
| import argparse | |
| import sys | |
| from pathlib import Path | |
| from typing import Literal | |
# ---------------------------------------------------------------------------
# Unicode ranges (explainable classification)
# ---------------------------------------------------------------------------
# Inclusive bounds of the Unicode "Arabic" block. Urdu is written in the
# Arabic script, so this one range serves as the (FYP-safe) heuristic for
# both Urdu and Arabic text.
ARABIC_START = 0x0600
ARABIC_END = 0x06FF


def _has_arabic_script(text: str) -> bool:
    """Report whether *text* contains a code point in U+0600-U+06FF.

    Urdu uses the Arabic script, so a hit in this range is taken to mean
    Urdu (or Arabic). No ML is involved: this is a plain, explainable
    Unicode range check.

    Args:
        text: The string to scan. Any falsy value (``""``, ``None``) is
            treated as containing no Arabic script.

    Returns:
        ``True`` if at least one character lies within
        ``ARABIC_START``..``ARABIC_END`` (both inclusive), else ``False``.
    """
    # The truthiness guard keeps None/"" from ever reaching the iteration.
    return bool(text) and any(
        ARABIC_START <= ord(ch) <= ARABIC_END for ch in text
    )
def _tesseract_string(image_arg, *, lang: str) -> str:
    """Run Tesseract on an image and return the recognised text.

    This is deliberately best-effort: any failure (pytesseract or Pillow not
    importable, OCR raising, an unusable image object, ...) is logged as a
    warning and reported to the caller as an empty string, never as an
    exception.

    Args:
        image_arg: One of
            * a file path (``str`` or ``Path``), passed to pytesseract as a
              string;
            * an array-like object exposing ``.shape``. For 3-D arrays the
              colour channels are reversed before OCR, which presumes
              BGR(A) input (OpenCV convention) - confirm against callers;
            * any other object (e.g. a PIL image), handed to pytesseract
              unchanged.
        lang: Tesseract language code, e.g. ``"eng"`` or ``"ara"``.

    Returns:
        The OCR text, or ``""`` if nothing was recognised or OCR failed.
    """
    import logging

    try:
        import pytesseract

        if isinstance(image_arg, (str, Path)):
            return pytesseract.image_to_string(str(image_arg), lang=lang) or ""

        shape = getattr(image_arg, "shape", None)
        if shape is None:
            # Not an array: let pytesseract decide whether it can read it.
            return pytesseract.image_to_string(image_arg, lang=lang) or ""

        from PIL import Image

        arr = image_arg
        if len(shape) == 3:
            channels = shape[2]
            if channels == 3:
                # Reverse the colour channels (BGR -> RGB).
                arr = arr[:, :, ::-1]
            elif channels == 4:
                # A full reversal would move alpha to the front; swap only
                # the colour channels and keep alpha last.
                arr = arr[:, :, [2, 1, 0, 3]]
            elif channels == 1:
                # Drop the singleton channel axis so the array is a plain
                # 2-D greyscale image.
                arr = arr[:, :, 0]
        pil = Image.fromarray(arr)
        return pytesseract.image_to_string(pil, lang=lang) or ""
    except Exception as exc:
        # Keep the best-effort contract, but leave a trace: otherwise a
        # missing Tesseract install is indistinguishable from a blank image.
        logging.getLogger(__name__).warning(
            "Tesseract OCR failed (lang=%s): %s: %s",
            lang,
            type(exc).__name__,
            exc,
        )
        return ""
def _has_latin_letters(text: str) -> bool:
    """Report whether *text* contains at least one basic Latin letter.

    Only the unaccented ASCII letters A-Z and a-z count; digits,
    punctuation, accented letters and non-Latin scripts do not.

    Args:
        text: The string to scan.

    Returns:
        ``True`` if any character is an ASCII letter, else ``False``
        (including for the empty string).
    """
    # An ASCII character that is alphabetic is exactly one of A-Z / a-z.
    return any(ch.isascii() and ch.isalpha() for ch in text)
def detect_script_page(image: str | Path) -> Literal["urdu"] | None:
    """Decide whether a whole page can be treated as Urdu.

    A page is only declared "urdu" when OCR finds Arabic script and no
    clear English:

    1. If ``image`` is a path that is not an existing file -> ``None``.
    2. OCR the full page with ``lang="eng"``. If that output contains Latin
       letters -> ``None`` (the page is English or mixed; use per-crop
       detection instead).
    3. Otherwise OCR with ``lang="ara"``. If that output contains Arabic
       script -> ``"urdu"`` (treat every crop on this page as Urdu);
       else -> ``None``.

    Running the English pass first keeps all-English pages (e.g. a QSL
    card) from being forced to Urdu when the ``ara`` pass returns noise.

    Args:
        image: Path to the page image.

    Returns:
        ``"urdu"`` for an all-Urdu page, otherwise ``None``.
    """
    given_as_path = isinstance(image, (str, Path))
    if given_as_path and not Path(image).is_file():
        return None

    english_pass = _tesseract_string(image, lang="eng")
    if english_pass.strip() and _has_latin_letters(english_pass):
        # Clear Latin text: not an all-Urdu page.
        return None

    arabic_pass = _tesseract_string(image, lang="ara")
    if _has_arabic_script(arabic_pass):
        return "urdu"
    return None
def detect_script(image: str | Path | "np.ndarray") -> dict:
    """Classify an image (or crop) as Urdu or English.

    English is only reported from the first pass when the ``eng`` OCR
    output clearly contains Latin text:

    1. If ``image`` is a path that is not an existing file -> "english".
    2. OCR with ``lang="eng"``.
    3. If that output contains Arabic-block characters -> "urdu".
    4. If it contains Latin letters (A-Z, a-z) and no Arabic -> "english".
    5. If it is empty or has no Latin letters (e.g. digits only, or an Urdu
       crop), OCR again with ``lang="ara"``: Arabic-block characters ->
       "urdu", otherwise "english".

    So all-English pages yield Latin text from the ``eng`` pass and stop
    there, while Urdu crops yield little or no Latin text and are settled by
    the ``ara`` pass.

    Args:
        image: A file path, or an image array accepted by
            ``_tesseract_string``.

    Returns:
        ``{"script": "urdu" | "english", "confidence": "heuristic"}``.
    """

    def _verdict(label: str) -> dict:
        # Every outcome shares the same fixed "heuristic" confidence tag.
        return {"script": label, "confidence": "heuristic"}

    if isinstance(image, (str, Path)):
        if not Path(image).is_file():
            return _verdict("english")

    english_pass = _tesseract_string(image, lang="eng")
    if _has_arabic_script(english_pass):
        return _verdict("urdu")
    if english_pass.strip() and _has_latin_letters(english_pass):
        return _verdict("english")

    arabic_pass = _tesseract_string(image, lang="ara")
    if _has_arabic_script(arabic_pass):
        return _verdict("urdu")
    return _verdict("english")
# ---------------------------------------------------------------------------
# Main: test detection on sample images
# ---------------------------------------------------------------------------
def _main() -> None:
    """Command-line entry point: run ``detect_script`` on sample images.

    Parses zero or more image paths from the command line. With no paths,
    prints a short usage hint to stderr and exits with status 0. Otherwise
    each existing file is classified and reported on stdout; paths that are
    not files are reported on stderr and skipped.
    """
    cli = argparse.ArgumentParser(
        description="Detect script (Urdu vs English) from images using pytesseract + Unicode checks.",
    )
    cli.add_argument(
        "images",
        type=Path,
        nargs="*",
        help="Paths to sample images to test. If none, print usage.",
    )
    image_paths = cli.parse_args().images

    if not image_paths:
        usage_lines = (
            "Usage: python script_detection.py <image1> [image2 ...]",
            "Example: python script_detection.py doc.png",
        )
        for line in usage_lines:
            print(line, file=sys.stderr)
        sys.exit(0)

    for image_path in image_paths:
        if image_path.is_file():
            verdict = detect_script(image_path)
            print(
                f"{image_path.name}: script={verdict['script']}, confidence={verdict['confidence']}"
            )
        else:
            print(f"Skip (not found): {image_path}", file=sys.stderr)


if __name__ == "__main__":
    _main()