""" Post-processing for Urdu OCR pipeline. Takes YOLOv8 box coordinates and per-crop OCR strings, then: - Sorts boxes in Urdu reading order (top-to-bottom, right-to-left) - Groups boxes into lines by vertical proximity - Concatenates text per line in RTL order - Cleans output (duplicate chars, non-Urdu ASCII noise; preserves Urdu punctuation) No retraining, no external NLP. Python only. """ from __future__ import annotations import re from typing import List, Sequence, Tuple, Union # --------------------------------------------------------------------------- # RTL and line grouping # --------------------------------------------------------------------------- # Vertical distance (in same units as box y coords) below which two boxes # are considered on the same line. Tune for your layout/image scale. DEFAULT_VERTICAL_THRESHOLD = 20 def _box_center(box: Tuple[float, float, float, float]) -> Tuple[float, float]: """(x1, y1, x2, y2) -> (center_x, center_y).""" x1, y1, x2, y2 = box return ((x1 + x2) / 2, (y1 + y2) / 2) def _sort_boxes_reading_order_rtl( boxes: List[Tuple[float, float, float, float]], texts: List[str], vertical_threshold: float, ) -> List[Tuple[Tuple[float, float, float, float], str]]: """ Sort boxes in Urdu reading order: top-to-bottom, then right-to-left within each line. RTL: Urdu is read from right to left, so the rightmost box on a line comes first. We group by line (similar y), then within each line sort by x descending (larger x = right). Primary sort by y (top to bottom); secondary sort by -center_x (right to left). """ if not boxes or len(boxes) != len(texts): return list(zip(boxes, texts)) # Pair and get centers paired = [(b, t) for b, t in zip(boxes, texts)] with_y = [(_box_center(b)[1], _box_center(b)[0], b, t) for b, t in paired] # Sort by y first (top to bottom) with_y.sort(key=lambda x: x[0]) # Group into lines: consecutive boxes whose y-centers are within vertical_threshold lines: List[List[Tuple[Tuple[float, float, float, float], str]]] = [] for _, _, b, t in with_y: cy = _box_center(b)[1] if not lines: lines.append([(b, t)]) continue last_line = lines[-1] # Compare to representative y of current line (e.g. first box's y) ref_y = _box_center(last_line[0][0])[1] if abs(cy - ref_y) <= vertical_threshold: last_line.append((b, t)) else: lines.append([(b, t)]) # Within each line: sort right-to-left (descending x). Urdu is RTL: rightmost # box on a line is read first, so we order by -center_x (largest x first). out: List[Tuple[Tuple[float, float, float, float], str]] = [] for line in lines: line.sort(key=lambda x: -_box_center(x[0])[0]) out.extend(line) return out def _clean_urdu_text(line: str) -> str: """ Clean one line of OCR output for Urdu. - Remove consecutive duplicate characters (e.g. ااا -> ا). - Remove non-Urdu ASCII noise (control chars, Latin, etc.); keep spaces. - Preserve Urdu/Arabic script and common punctuation (، ؛ ؟ . etc.). """ if not line: return line # Remove duplicate consecutive characters dedup = [] for c in line: if dedup and c == dedup[-1]: continue dedup.append(c) s = "".join(dedup) # Keep: Arabic block (U+0600–U+06FF), Arabic Supplement (U+0750–U+077F), # Arabic Extended-A (U+08A0–U+08FF), space, ZWNJ, ZWJ, and common punctuation # Strip non-Urdu ASCII (Latin, digits if you want to drop them, etc.) kept = [] for c in s: code = ord(c) if c.isspace() or c in "\u200c\u200d": # ZWNJ, ZWJ kept.append(c) elif 0x0600 <= code <= 0x06FF: # Arabic kept.append(c) elif 0x0750 <= code <= 0x077F: # Arabic Supplement kept.append(c) elif 0x08A0 <= code <= 0x08FF: # Arabic Extended-A kept.append(c) elif c in "،؛؟.۔!\"'()-–—": # Common punctuation (Urdu/Arabic + neutral) kept.append(c) # Skip ASCII and other noise (Latin, control, etc.) cleaned = "".join(kept) # Normalize multiple spaces to one cleaned = re.sub(r"\s+", " ", cleaned) return cleaned.strip() def post_process( boxes: Sequence[Union[Tuple[float, float, float, float], Sequence[float]]], texts: Sequence[str], *, vertical_threshold: float = DEFAULT_VERTICAL_THRESHOLD, ) -> str: """ Post-process detected boxes and OCR strings into clean, multiline Urdu text in RTL order. Steps: 1. Sort boxes in reading order: top-to-bottom (y), then right-to-left (x) within each line. 2. Group boxes into lines using vertical_threshold (same line if y-distance <= threshold). 3. Within each line, concatenate recognized text in RTL order (rightmost box first). 4. Clean each line: remove duplicate consecutive chars and non-Urdu ASCII; keep Urdu punctuation. 5. Return a single string with one line per visual line, suitable for RTL display. Args: boxes: List of (x1, y1, x2, y2) in image coordinates (e.g. from YOLOv8 xyxy). texts: List of recognized Urdu strings, one per box; same order as boxes. vertical_threshold: Max y-distance to consider two boxes on the same line (same units as y). Returns: Single multiline string: each line is cleaned and in RTL order; lines separated by newline. """ if not boxes or not texts: return "" if len(boxes) != len(texts): texts = list(texts) + [""] * (len(boxes) - len(texts)) texts = texts[: len(boxes)] # Normalize to list of 4-tuples (x1, y1, x2, y2) box_tuples = [] for b in boxes: t = tuple(b)[:4] if hasattr(b, "__iter__") and not isinstance(b, str) else (0.0, 0.0, 0.0, 0.0) t = t + (0.0,) * (4 - len(t)) box_tuples.append((float(t[0]), float(t[1]), float(t[2]), float(t[3]))) text_list = list(texts)[: len(box_tuples)] # Sort in Urdu reading order (top-to-bottom, then right-to-left per line) sorted_pairs = _sort_boxes_reading_order_rtl( box_tuples, text_list, vertical_threshold ) # Group again by line (same grouping as in sort) and concatenate text per line lines_text: List[str] = [] current_line: List[str] = [] current_y: float | None = None for box, text in sorted_pairs: cy = _box_center(box)[1] if current_y is None: current_y = cy current_line = [text] continue if abs(cy - current_y) <= vertical_threshold: current_line.append(text) else: # New line: emit previous line (already in RTL order: rightmost first) line_str = " ".join(current_line) lines_text.append(_clean_urdu_text(line_str)) current_line = [text] current_y = cy if current_line: line_str = " ".join(current_line) lines_text.append(_clean_urdu_text(line_str)) return "\n".join(lines_text)