Spaces:
Sleeping
Sleeping
File size: 5,116 Bytes
04f9475 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | """
Script detection for OCR pipelines: Urdu vs English.
Uses lightweight pytesseract OCR and Unicode range checks only.
No ML models, no training. Explainable and FYP-safe.
Reusable by future pipelines (bilingual routing, etc.).
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
from typing import Literal
# ---------------------------------------------------------------------------
# Unicode ranges (explainable classification)
# ---------------------------------------------------------------------------
# Arabic block: used for Urdu and Arabic script (FYP-safe heuristic).
ARABIC_START = 0x0600
ARABIC_END = 0x06FF
def _has_arabic_script(text: str) -> bool:
"""
Return True if any character in text falls in the Arabic Unicode block (U+0600βU+06FF).
Urdu uses the Arabic script, so presence of this range indicates Urdu (or Arabic).
No ML; purely explainable Unicode check.
"""
if not text:
return False
for char in text:
if ARABIC_START <= ord(char) <= ARABIC_END:
return True
return False
def _tesseract_string(image_arg, *, lang: str) -> str:
"""Run Tesseract and return text; empty on any error."""
try:
import pytesseract
if isinstance(image_arg, (str, Path)):
return pytesseract.image_to_string(str(image_arg), lang=lang) or ""
from PIL import Image
import numpy as np
arr = image_arg
if len(arr.shape) == 3:
arr = arr[:, :, ::-1]
pil = Image.fromarray(arr)
return pytesseract.image_to_string(pil, lang=lang) or ""
except Exception:
return ""
def _has_latin_letters(text: str) -> bool:
"""True if text has at least one Latin letter (A-Z, a-z)."""
for c in text:
if "a" <= c <= "z" or "A" <= c <= "Z":
return True
return False
def detect_script_page(image: str | Path) -> Literal["urdu"] | None:
"""
Page-level script check: only treat page as "all Urdu" when page has Arabic and no clear English.
Run Tesseract with lang="eng" first on the full page. If eng output has Latin letters β None
(page is English or mixed; use per-crop detection). If eng is empty or has no Latin, run lang="ara";
if ara has Arabic β "urdu" (treat all crops on this page as Urdu). Else β None.
This keeps all-English pages (e.g. QSL card) from being forced to Urdu when ara returns noise.
"""
if isinstance(image, (str, Path)) and not Path(image).is_file():
return None
text_eng = _tesseract_string(image, lang="eng")
if text_eng.strip() and _has_latin_letters(text_eng):
return None
text_ara = _tesseract_string(image, lang="ara")
return "urdu" if _has_arabic_script(text_ara) else None
def detect_script(image: str | Path | "np.ndarray") -> dict:
"""
Detect script (Urdu vs English): English only when eng output clearly has Latin text.
1. Run Tesseract with lang="eng" first.
2. If output has Arabic Unicode β "urdu".
3. If output has Latin letters (AβZ, aβz) and no Arabic β "english".
4. If output is empty or has no Latin letters (e.g. numbers only, or Urdu crop) β try lang="ara";
if ara has Arabic β "urdu", else "english".
So: all-English pages get Latin from eng β English; Urdu crops get little/no Latin from eng, we try ara β Urdu.
"""
if isinstance(image, (str, Path)) and not Path(image).is_file():
return {"script": "english", "confidence": "heuristic"}
text_eng = _tesseract_string(image, lang="eng")
if _has_arabic_script(text_eng):
return {"script": "urdu", "confidence": "heuristic"}
if text_eng.strip() and _has_latin_letters(text_eng):
return {"script": "english", "confidence": "heuristic"}
text_ara = _tesseract_string(image, lang="ara")
script: Literal["urdu", "english"] = "urdu" if _has_arabic_script(text_ara) else "english"
return {"script": script, "confidence": "heuristic"}
# ---------------------------------------------------------------------------
# Main: test detection on sample images
# ---------------------------------------------------------------------------
def _main() -> None:
parser = argparse.ArgumentParser(
description="Detect script (Urdu vs English) from images using pytesseract + Unicode checks.",
)
parser.add_argument(
"images",
type=Path,
nargs="*",
help="Paths to sample images to test. If none, print usage.",
)
args = parser.parse_args()
if not args.images:
print("Usage: python script_detection.py <image1> [image2 ...]", file=sys.stderr)
print("Example: python script_detection.py doc.png", file=sys.stderr)
sys.exit(0)
for path in args.images:
if not path.is_file():
print(f"Skip (not found): {path}", file=sys.stderr)
continue
result = detect_script(path)
print(f"{path.name}: script={result['script']}, confidence={result['confidence']}")
if __name__ == "__main__":
_main()
|