File size: 7,152 Bytes
04f9475
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
"""
Post-processing for Urdu OCR pipeline.

Takes YOLOv8 box coordinates and per-crop OCR strings, then:
- Sorts boxes in Urdu reading order (top-to-bottom, right-to-left)
- Groups boxes into lines by vertical proximity
- Concatenates text per line in RTL order
- Cleans output (duplicate chars, non-Urdu ASCII noise; preserves Urdu punctuation)

No retraining, no external NLP. Python only.
"""

from __future__ import annotations

import re
from typing import List, Sequence, Tuple, Union

# ---------------------------------------------------------------------------
# RTL and line grouping
# ---------------------------------------------------------------------------

# Maximum absolute difference between two box center-y values (in the same
# units as the box y coordinates) for the boxes to be treated as lying on the
# same text line; the comparison is inclusive (<=). Used as the default
# `vertical_threshold` of post_process(). Tune for your layout/image scale.
DEFAULT_VERTICAL_THRESHOLD = 20


def _box_center(box: Tuple[float, float, float, float]) -> Tuple[float, float]:
    """(x1, y1, x2, y2) -> (center_x, center_y)."""
    x1, y1, x2, y2 = box
    return ((x1 + x2) / 2, (y1 + y2) / 2)


def _sort_boxes_reading_order_rtl(
    boxes: List[Tuple[float, float, float, float]],
    texts: List[str],
    vertical_threshold: float,
) -> List[Tuple[Tuple[float, float, float, float], str]]:
    """
    Sort boxes in Urdu reading order: top-to-bottom, then right-to-left within each line.

    RTL: Urdu is read from right to left, so the rightmost box on a line comes first.
    We group by line (similar y), then within each line sort by x descending (larger x = right).
    Primary sort by y (top to bottom); secondary sort by -center_x (right to left).
    """
    if not boxes or len(boxes) != len(texts):
        return list(zip(boxes, texts))

    # Pair and get centers
    paired = [(b, t) for b, t in zip(boxes, texts)]
    with_y = [(_box_center(b)[1], _box_center(b)[0], b, t) for b, t in paired]

    # Sort by y first (top to bottom)
    with_y.sort(key=lambda x: x[0])

    # Group into lines: consecutive boxes whose y-centers are within vertical_threshold
    lines: List[List[Tuple[Tuple[float, float, float, float], str]]] = []
    for _, _, b, t in with_y:
        cy = _box_center(b)[1]
        if not lines:
            lines.append([(b, t)])
            continue
        last_line = lines[-1]
        # Compare to representative y of current line (e.g. first box's y)
        ref_y = _box_center(last_line[0][0])[1]
        if abs(cy - ref_y) <= vertical_threshold:
            last_line.append((b, t))
        else:
            lines.append([(b, t)])

    # Within each line: sort right-to-left (descending x). Urdu is RTL: rightmost
    # box on a line is read first, so we order by -center_x (largest x first).
    out: List[Tuple[Tuple[float, float, float, float], str]] = []
    for line in lines:
        line.sort(key=lambda x: -_box_center(x[0])[0])
        out.extend(line)
    return out


def _clean_urdu_text(line: str) -> str:
    """
    Clean one line of OCR output for Urdu.

    - Remove consecutive duplicate characters (e.g. ااا -> ا).
    - Remove non-Urdu ASCII noise (control chars, Latin, etc.); keep spaces.
    - Preserve Urdu/Arabic script and common punctuation (، ؛ ؟ . etc.).
    """
    if not line:
        return line
    # Remove duplicate consecutive characters
    dedup = []
    for c in line:
        if dedup and c == dedup[-1]:
            continue
        dedup.append(c)
    s = "".join(dedup)
    # Keep: Arabic block (U+0600–U+06FF), Arabic Supplement (U+0750–U+077F),
    # Arabic Extended-A (U+08A0–U+08FF), space, ZWNJ, ZWJ, and common punctuation
    # Strip non-Urdu ASCII (Latin, digits if you want to drop them, etc.)
    kept = []
    for c in s:
        code = ord(c)
        if c.isspace() or c in "\u200c\u200d":  # ZWNJ, ZWJ
            kept.append(c)
        elif 0x0600 <= code <= 0x06FF:  # Arabic
            kept.append(c)
        elif 0x0750 <= code <= 0x077F:  # Arabic Supplement
            kept.append(c)
        elif 0x08A0 <= code <= 0x08FF:  # Arabic Extended-A
            kept.append(c)
        elif c in "،؛؟.۔!\"'()-–—":  # Common punctuation (Urdu/Arabic + neutral)
            kept.append(c)
        # Skip ASCII and other noise (Latin, control, etc.)
    cleaned = "".join(kept)
    # Normalize multiple spaces to one
    cleaned = re.sub(r"\s+", " ", cleaned)
    return cleaned.strip()


def _group_texts_into_lines_rtl(
    boxes: List[Tuple[float, float, float, float]],
    texts: List[str],
    vertical_threshold: float,
) -> List[List[str]]:
    """Group texts into visual lines (top-to-bottom), each ordered right-to-left.

    Uses the same rule as _sort_boxes_reading_order_rtl: boxes are visited by
    ascending center-y, a line is anchored on the first (topmost) box placed in
    it, and a box joins the current line when its center-y is within
    ``vertical_threshold`` (inclusive) of that anchor.
    """
    entries = []
    for box, text in zip(boxes, texts):
        center_x, center_y = _box_center(box)
        entries.append((center_y, center_x, text))
    # Stable sort: boxes with equal center-y keep their input order.
    entries.sort(key=lambda entry: entry[0])

    rows: List[List[Tuple[float, str]]] = []
    anchor_y = 0.0
    for center_y, center_x, text in entries:
        if rows and abs(center_y - anchor_y) <= vertical_threshold:
            rows[-1].append((center_x, text))
        else:
            rows.append([(center_x, text)])
            anchor_y = center_y

    lines: List[List[str]] = []
    for row in rows:
        # RTL: the rightmost box (largest center-x) is read first.
        row.sort(key=lambda entry: -entry[0])
        lines.append([text for _, text in row])
    return lines


def post_process(
    boxes: Sequence[Union[Tuple[float, float, float, float], Sequence[float]]],
    texts: Sequence[str],
    *,
    vertical_threshold: float = DEFAULT_VERTICAL_THRESHOLD,
) -> str:
    """
    Post-process detected boxes and OCR strings into clean, multiline Urdu text in RTL order.

    Steps:
    1. Normalize every box to a 4-tuple of floats (x1, y1, x2, y2).
    2. Group boxes into lines by center-y: a line is anchored on its topmost
       box and contains every box within vertical_threshold of that anchor.
    3. Within each line, join the recognized texts with a single space in RTL
       order (rightmost box first).
    4. Clean each line with _clean_urdu_text.
    5. Return one output line per visual line, top-to-bottom.

    Lines are grouped exactly once. Re-deriving the line breaks from the sorted
    sequence with a different anchor (the rightmost box instead of the topmost
    one) can merge two adjacent lines, so that is deliberately avoided.

    Args:
        boxes: Boxes as (x1, y1, x2, y2) in image coordinates (e.g. from YOLOv8 xyxy).
            Any sized container of 4-item iterables is accepted. Entries that are
            not iterable become an all-zero box; short entries are zero-padded
            and extra items beyond the fourth are ignored.
        texts: Recognized Urdu strings, one per box, in the same order as boxes.
            If there are fewer texts than boxes the missing ones are treated as
            ""; surplus texts are ignored.
        vertical_threshold: Max center-y distance (inclusive, same units as y)
            from a line's anchor box for a box to belong to that line.

    Returns:
        Single multiline string: each line is cleaned and in RTL order; lines are
        separated by a newline. Returns "" when boxes or texts is None or empty.
    """
    # Use explicit None/len checks instead of truthiness: array-like inputs
    # (numpy arrays, torch tensors) raise on bool() when they hold several items.
    if boxes is None or texts is None or len(boxes) == 0 or len(texts) == 0:
        return ""

    box_count = len(boxes)
    text_list = list(texts)[:box_count]
    text_list.extend([""] * (box_count - len(text_list)))

    # Normalize to list of 4-tuples (x1, y1, x2, y2)
    box_tuples: List[Tuple[float, float, float, float]] = []
    for b in boxes:
        if hasattr(b, "__iter__") and not isinstance(b, str):
            coords = tuple(b)[:4]
        else:
            coords = ()
        coords = coords + (0.0,) * (4 - len(coords))
        box_tuples.append(
            (float(coords[0]), float(coords[1]), float(coords[2]), float(coords[3]))
        )

    lines = _group_texts_into_lines_rtl(box_tuples, text_list, vertical_threshold)
    return "\n".join(_clean_urdu_text(" ".join(line)) for line in lines)