bilingual-ocr-api / post_process.py
Zarm33na's picture
Initial deployment: bilingual OCR API (Urdu + English)
04f9475
"""
Post-processing for Urdu OCR pipeline.
Takes YOLOv8 box coordinates and per-crop OCR strings, then:
- Sorts boxes in Urdu reading order (top-to-bottom, right-to-left)
- Groups boxes into lines by vertical proximity
- Concatenates text per line in RTL order
- Cleans output (duplicate chars, non-Urdu ASCII noise; preserves Urdu punctuation)
No retraining, no external NLP. Python only.
"""
from __future__ import annotations
import re
from typing import List, Sequence, Tuple, Union
# ---------------------------------------------------------------------------
# RTL and line grouping
# ---------------------------------------------------------------------------
# Maximum difference between two boxes' vertical centres (in the same units
# as the box y coordinates) for them to be treated as part of the same text
# line. Used as the default for post_process(vertical_threshold=...); tune it
# for your layout/image scale.
DEFAULT_VERTICAL_THRESHOLD = 20
def _box_center(box: Tuple[float, float, float, float]) -> Tuple[float, float]:
"""(x1, y1, x2, y2) -> (center_x, center_y)."""
x1, y1, x2, y2 = box
return ((x1 + x2) / 2, (y1 + y2) / 2)
def _sort_boxes_reading_order_rtl(
boxes: List[Tuple[float, float, float, float]],
texts: List[str],
vertical_threshold: float,
) -> List[Tuple[Tuple[float, float, float, float], str]]:
"""
Sort boxes in Urdu reading order: top-to-bottom, then right-to-left within each line.
RTL: Urdu is read from right to left, so the rightmost box on a line comes first.
We group by line (similar y), then within each line sort by x descending (larger x = right).
Primary sort by y (top to bottom); secondary sort by -center_x (right to left).
"""
if not boxes or len(boxes) != len(texts):
return list(zip(boxes, texts))
# Pair and get centers
paired = [(b, t) for b, t in zip(boxes, texts)]
with_y = [(_box_center(b)[1], _box_center(b)[0], b, t) for b, t in paired]
# Sort by y first (top to bottom)
with_y.sort(key=lambda x: x[0])
# Group into lines: consecutive boxes whose y-centers are within vertical_threshold
lines: List[List[Tuple[Tuple[float, float, float, float], str]]] = []
for _, _, b, t in with_y:
cy = _box_center(b)[1]
if not lines:
lines.append([(b, t)])
continue
last_line = lines[-1]
# Compare to representative y of current line (e.g. first box's y)
ref_y = _box_center(last_line[0][0])[1]
if abs(cy - ref_y) <= vertical_threshold:
last_line.append((b, t))
else:
lines.append([(b, t)])
# Within each line: sort right-to-left (descending x). Urdu is RTL: rightmost
# box on a line is read first, so we order by -center_x (largest x first).
out: List[Tuple[Tuple[float, float, float, float], str]] = []
for line in lines:
line.sort(key=lambda x: -_box_center(x[0])[0])
out.extend(line)
return out
def _clean_urdu_text(line: str) -> str:
"""
Clean one line of OCR output for Urdu.
- Remove consecutive duplicate characters (e.g. ااا -> ا).
- Remove non-Urdu ASCII noise (control chars, Latin, etc.); keep spaces.
- Preserve Urdu/Arabic script and common punctuation (، ؛ ؟ . etc.).
"""
if not line:
return line
# Remove duplicate consecutive characters
dedup = []
for c in line:
if dedup and c == dedup[-1]:
continue
dedup.append(c)
s = "".join(dedup)
# Keep: Arabic block (U+0600–U+06FF), Arabic Supplement (U+0750–U+077F),
# Arabic Extended-A (U+08A0–U+08FF), space, ZWNJ, ZWJ, and common punctuation
# Strip non-Urdu ASCII (Latin, digits if you want to drop them, etc.)
kept = []
for c in s:
code = ord(c)
if c.isspace() or c in "\u200c\u200d": # ZWNJ, ZWJ
kept.append(c)
elif 0x0600 <= code <= 0x06FF: # Arabic
kept.append(c)
elif 0x0750 <= code <= 0x077F: # Arabic Supplement
kept.append(c)
elif 0x08A0 <= code <= 0x08FF: # Arabic Extended-A
kept.append(c)
elif c in "،؛؟.۔!\"'()-–—": # Common punctuation (Urdu/Arabic + neutral)
kept.append(c)
# Skip ASCII and other noise (Latin, control, etc.)
cleaned = "".join(kept)
# Normalize multiple spaces to one
cleaned = re.sub(r"\s+", " ", cleaned)
return cleaned.strip()
def _group_line_texts_rtl(
    boxes: Sequence[Tuple[float, float, float, float]],
    texts: Sequence[str],
    vertical_threshold: float,
) -> List[List[str]]:
    """
    Split OCR texts into visual lines, each ordered right-to-left.

    Boxes are visited top-to-bottom by vertical centre. A box joins the current
    line while its centre is within vertical_threshold of the line's anchor
    (the top-most box of that line); otherwise it opens a new line. The texts
    of each line are then ordered by descending horizontal centre, i.e. the
    rightmost box first.
    """
    by_top = sorted(
        ((_box_center(box), text) for box, text in zip(boxes, texts)),
        key=lambda item: item[0][1],
    )

    lines: List[List[Tuple[float, str]]] = []
    anchor_y = 0.0  # only read once `lines` is non-empty
    for (center_x, center_y), text in by_top:
        if lines and abs(center_y - anchor_y) <= vertical_threshold:
            lines[-1].append((center_x, text))
        else:
            lines.append([(center_x, text)])
            anchor_y = center_y

    # sorted() is stable, so boxes sharing an x-centre keep top-to-bottom order.
    return [
        [text for _, text in sorted(line, key=lambda item: -item[0])]
        for line in lines
    ]


def post_process(
    boxes: Sequence[Union[Tuple[float, float, float, float], Sequence[float]]],
    texts: Sequence[str],
    *,
    vertical_threshold: float = DEFAULT_VERTICAL_THRESHOLD,
) -> str:
    """
    Post-process detected boxes and OCR strings into clean, multiline Urdu text in RTL order.

    Steps:
    1. Normalize every box to a 4-tuple of floats (x1, y1, x2, y2).
    2. Group boxes into lines, top-to-bottom: a box belongs to the current line
       while its vertical centre is within vertical_threshold of the line's
       top-most box.
    3. Within each line, concatenate the recognized text in RTL order
       (rightmost box first), separated by single spaces.
    4. Clean each line: remove duplicate consecutive chars and non-Urdu ASCII;
       keep Urdu punctuation.
    5. Return one output line per visual line, joined with newlines.

    Line boundaries are decided once, while the boxes are still in
    top-to-bottom order. Re-deriving them from the RTL-sorted sequence would
    anchor each line at its rightmost box instead of its top-most one and could
    merge a following line into the current one.

    Args:
        boxes: (x1, y1, x2, y2) boxes in image coordinates (e.g. from YOLOv8
            xyxy). A box with fewer than four values is padded with 0.0; a
            non-iterable (or str) entry is treated as an all-zero box.
        texts: Recognized Urdu strings, one per box, in the same order as
            boxes. If shorter than boxes it is padded with empty strings; extra
            entries are ignored.
        vertical_threshold: Max difference between vertical centres for two
            boxes to share a line (same units as the y coordinates).

    Returns:
        Single multiline string: each line is cleaned and in RTL order; lines
        are separated by a newline. Returns "" when boxes or texts is empty.
        A line whose text is entirely removed by cleaning is kept as an empty
        line.
    """
    # Explicit None/len checks instead of truthiness, so array-like containers
    # (whose truth value can be ambiguous) are accepted as well as lists.
    if boxes is None or texts is None or len(boxes) == 0 or len(texts) == 0:
        return ""

    # Normalize to list of 4-tuples (x1, y1, x2, y2).
    box_tuples: List[Tuple[float, float, float, float]] = []
    for raw in boxes:
        if hasattr(raw, "__iter__") and not isinstance(raw, str):
            coords = tuple(raw)[:4]
        else:
            coords = ()
        coords = coords + (0.0,) * (4 - len(coords))
        box_tuples.append(
            (float(coords[0]), float(coords[1]), float(coords[2]), float(coords[3]))
        )

    # Align texts with boxes: drop extras, pad missing entries with "".
    text_list = list(texts)[: len(box_tuples)]
    text_list.extend([""] * (len(box_tuples) - len(text_list)))

    grouped = _group_line_texts_rtl(box_tuples, text_list, vertical_threshold)
    return "\n".join(_clean_urdu_text(" ".join(line)) for line in grouped)