Spaces:
Sleeping
Sleeping
File size: 7,152 Bytes
"""
Post-processing for Urdu OCR pipeline.
Takes YOLOv8 box coordinates and per-crop OCR strings, then:
- Sorts boxes in Urdu reading order (top-to-bottom, right-to-left)
- Groups boxes into lines by vertical proximity
- Concatenates text per line in RTL order
- Cleans output (duplicate chars, non-Urdu ASCII noise; preserves Urdu punctuation)
No retraining, no external NLP. Python only.
"""
from __future__ import annotations
import re
from typing import List, Sequence, Tuple, Union
# ---------------------------------------------------------------------------
# RTL and line grouping
# ---------------------------------------------------------------------------
# Default for post_process()'s `vertical_threshold` argument: the largest
# distance between two boxes' vertical centers (in the same units as the box
# y coordinates) at which they are still treated as being on the same line.
# The comparison is inclusive (distance <= threshold). Tune for your
# layout/image scale.
DEFAULT_VERTICAL_THRESHOLD = 20
def _box_center(box: Tuple[float, float, float, float]) -> Tuple[float, float]:
"""(x1, y1, x2, y2) -> (center_x, center_y)."""
x1, y1, x2, y2 = box
return ((x1 + x2) / 2, (y1 + y2) / 2)
def _sort_boxes_reading_order_rtl(
    boxes: List[Tuple[float, float, float, float]],
    texts: List[str],
    vertical_threshold: float,
) -> List[Tuple[Tuple[float, float, float, float], str]]:
    """
    Order (box, text) pairs the way Urdu is read: lines top-to-bottom,
    and right-to-left inside each line.

    Boxes are first visited by ascending vertical center. A box joins the
    line currently being built when its vertical center lies within
    vertical_threshold of that line's first (topmost) box; otherwise it opens
    a new line. Each finished line is then ordered by descending horizontal
    center, because the rightmost box on a line is read first.

    If boxes is empty or boxes/texts differ in length, the pairs are returned
    in input order without any sorting.
    """
    if not (boxes and len(boxes) == len(texts)):
        return list(zip(boxes, texts))

    # Top-to-bottom pass; sorted() is stable, so ties keep their input order.
    top_down = sorted(zip(boxes, texts), key=lambda pair: _box_center(pair[0])[1])

    rows: List[List[Tuple[Tuple[float, float, float, float], str]]] = []
    row_anchor_y = 0.0  # vertical center of the first box of the row being filled
    for pair in top_down:
        center_y = _box_center(pair[0])[1]
        if rows and abs(center_y - row_anchor_y) <= vertical_threshold:
            rows[-1].append(pair)
        else:
            rows.append([pair])
            row_anchor_y = center_y

    # Right-to-left inside every row: negate center_x so the largest x sorts first.
    ordered: List[Tuple[Tuple[float, float, float, float], str]] = []
    for row in rows:
        ordered.extend(sorted(row, key=lambda pair: -_box_center(pair[0])[0]))
    return ordered
def _clean_urdu_text(line: str) -> str:
"""
Clean one line of OCR output for Urdu.
- Remove consecutive duplicate characters (e.g. ااا -> ا).
- Remove non-Urdu ASCII noise (control chars, Latin, etc.); keep spaces.
- Preserve Urdu/Arabic script and common punctuation (، ؛ ؟ . etc.).
"""
if not line:
return line
# Remove duplicate consecutive characters
dedup = []
for c in line:
if dedup and c == dedup[-1]:
continue
dedup.append(c)
s = "".join(dedup)
# Keep: Arabic block (U+0600–U+06FF), Arabic Supplement (U+0750–U+077F),
# Arabic Extended-A (U+08A0–U+08FF), space, ZWNJ, ZWJ, and common punctuation
# Strip non-Urdu ASCII (Latin, digits if you want to drop them, etc.)
kept = []
for c in s:
code = ord(c)
if c.isspace() or c in "\u200c\u200d": # ZWNJ, ZWJ
kept.append(c)
elif 0x0600 <= code <= 0x06FF: # Arabic
kept.append(c)
elif 0x0750 <= code <= 0x077F: # Arabic Supplement
kept.append(c)
elif 0x08A0 <= code <= 0x08FF: # Arabic Extended-A
kept.append(c)
elif c in "،؛؟.۔!\"'()-–—": # Common punctuation (Urdu/Arabic + neutral)
kept.append(c)
# Skip ASCII and other noise (Latin, control, etc.)
cleaned = "".join(kept)
# Normalize multiple spaces to one
cleaned = re.sub(r"\s+", " ", cleaned)
return cleaned.strip()
def post_process(
    boxes: Sequence[Union[Tuple[float, float, float, float], Sequence[float]]],
    texts: Sequence[str],
    *,
    vertical_threshold: float = DEFAULT_VERTICAL_THRESHOLD,
) -> str:
    """
    Post-process detected boxes and OCR strings into clean, multiline Urdu text in RTL order.

    Steps:
    1. Visit boxes top-to-bottom by vertical center and group them into lines:
       a box belongs to the current line when its vertical center is within
       vertical_threshold of that line's topmost box (the same rule used by
       _sort_boxes_reading_order_rtl).
    2. Within each line, order the texts right-to-left (rightmost box first).
    3. Join each line's texts with single spaces and clean the result with
       _clean_urdu_text.
    4. Return one output line per visual line, joined with newlines.

    Args:
        boxes: (x1, y1, x2, y2) per detection, in image coordinates. Entries
            shorter than four values are zero-padded; non-iterable or string
            entries are treated as (0, 0, 0, 0).
        texts: Recognized strings, one per box, in the same order as boxes.
            If there are fewer texts than boxes the missing ones are treated
            as ""; surplus texts are ignored.
        vertical_threshold: Max vertical-center distance (inclusive, same
            units as y) for two boxes to share a line.

    Returns:
        Single multiline string, or "" when boxes or texts is empty. Lines
        whose text cleans down to nothing are still emitted as empty lines.
    """
    # len() instead of truthiness so array-like inputs that refuse bool()
    # (but support len()) are accepted as well as plain lists.
    if len(boxes) == 0 or len(texts) == 0:
        return ""

    # Exactly one text per box: truncate surplus, pad the shortfall with "".
    text_list = list(texts)[: len(boxes)]
    text_list += [""] * (len(boxes) - len(text_list))

    box_tuples = [_as_xyxy(box) for box in boxes]
    lines = _group_texts_into_lines(box_tuples, text_list, vertical_threshold)
    return "\n".join(_clean_urdu_text(" ".join(line)) for line in lines)


def _as_xyxy(box) -> Tuple[float, float, float, float]:
    """Coerce one box to four floats; pad short boxes with 0.0, zero out non-boxes."""
    if isinstance(box, str) or not hasattr(box, "__iter__"):
        return (0.0, 0.0, 0.0, 0.0)
    coords = tuple(box)[:4]
    coords = coords + (0.0,) * (4 - len(coords))
    return (float(coords[0]), float(coords[1]), float(coords[2]), float(coords[3]))


def _group_texts_into_lines(
    boxes: List[Tuple[float, float, float, float]],
    texts: List[str],
    vertical_threshold: float,
) -> List[List[str]]:
    """Split texts into visual lines (top-to-bottom), each ordered right-to-left."""
    # (center_x, center_y, text) visited top to bottom; sorted() is stable.
    entries = sorted(
        ((*_box_center(box), text) for box, text in zip(boxes, texts)),
        key=lambda entry: entry[1],
    )

    # Line membership must be decided here, against the line's topmost box,
    # BEFORE the right-to-left reorder. Once a line is reordered its first
    # element is the rightmost box, and measuring against that one can pull
    # boxes of the next line into the current line.
    lines: List[List[Tuple[float, str]]] = []
    anchor_y = 0.0
    for center_x, center_y, text in entries:
        if lines and abs(center_y - anchor_y) <= vertical_threshold:
            lines[-1].append((center_x, text))
        else:
            lines.append([(center_x, text)])
            anchor_y = center_y

    # Rightmost box first: negate center_x so the stable sort keeps tie order.
    return [
        [text for _, text in sorted(line, key=lambda item: -item[0])]
        for line in lines
    ]
|