Spaces:
Sleeping
Sleeping
Update utils/ocr_utils.py
Browse files- utils/ocr_utils.py +25 -43
utils/ocr_utils.py
CHANGED
|
@@ -5,54 +5,36 @@ from PIL import Image, ImageDraw, ImageFont
|
|
| 5 |
ocr_model = PaddleOCR(use_textline_orientation=True, lang='ch')
|
| 6 |
|
| 7 |
def group_nearby_boxes(lines, max_y_gap=50):
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
if not
|
| 18 |
continue
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
y_min, y_max = min(ys), max(ys)
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
"text": text,
|
| 27 |
-
"y_center": (y_min + y_max) / 2,
|
| 28 |
-
"y_min": y_min,
|
| 29 |
-
"y_max": y_max,
|
| 30 |
-
})
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
|
|
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
current_group.append(curr)
|
| 44 |
-
else:
|
| 45 |
-
groups.append({
|
| 46 |
-
"polygons": [b["polygon"] for b in current_group],
|
| 47 |
-
"texts": [b["text"] for b in current_group],
|
| 48 |
-
})
|
| 49 |
-
current_group = [curr]
|
| 50 |
-
|
| 51 |
-
if current_group:
|
| 52 |
-
groups.append({
|
| 53 |
-
"polygons": [b["polygon"] for b in current_group],
|
| 54 |
-
"texts": [b["text"] for b in current_group],
|
| 55 |
-
})
|
| 56 |
|
| 57 |
return groups
|
| 58 |
|
|
|
|
| 5 |
# PaddleOCR instance configured with text-line orientation handling enabled
# and the 'ch' language setting.
ocr_model = PaddleOCR(use_textline_orientation=True, lang='ch')
|
| 6 |
|
| 7 |
def group_nearby_boxes(lines, max_y_gap=50):
    """Group OCR text boxes that sit at a similar vertical position.

    Boxes are visited in input order. The first box that is not yet grouped
    seeds a new group; every other ungrouped box whose top edge OR bottom
    edge lies within ``max_y_gap`` of the seed's corresponding edge joins
    that group. Comparison is always against the seed box, not against the
    most recently added member, so groups are not transitively chained.

    Args:
        lines: Iterable of ``(polygon, text)`` pairs. A polygon is a
            non-empty list/tuple of 2-element list/tuple points, read as
            ``(x, y)`` (assumed PaddleOCR point order -- confirm against the
            caller if the polygons come from elsewhere).
        max_y_gap: Exclusive upper bound on the difference between two boxes'
            top edges (or bottom edges) for them to share a group.

    Returns:
        A list of groups. Each group is a list of ``(polygon, text)`` tuples
        in input order, with the seed first. Entries whose polygon is
        malformed or empty are left out of every group.
    """

    def vertical_extent(poly):
        """Return ``(y_min, y_max)`` of *poly*, or None if it is unusable."""
        # An empty polygon must be rejected explicitly: all() over an empty
        # iterable is True, and min()/max() would then raise ValueError.
        if not isinstance(poly, (list, tuple)) or not poly:
            return None
        if not all(isinstance(pt, (list, tuple)) and len(pt) == 2 for pt in poly):
            return None
        ys = [pt[1] for pt in poly]  # index 1 is the y coordinate of (x, y)
        return min(ys), max(ys)

    # Materialise once so generators work and extents are computed a single
    # time per box instead of once per pairwise comparison.
    entries = [(poly, text) for poly, text in lines]
    extents = [vertical_extent(poly) for poly, _ in entries]

    groups = []
    used = set()

    for i, seed_extent in enumerate(extents):
        if i in used or seed_extent is None:
            continue

        seed_top, seed_bottom = seed_extent
        group = [entries[i]]
        used.add(i)

        for j, other_extent in enumerate(extents):
            if j in used or other_extent is None:
                continue

            other_top, other_bottom = other_extent
            if (
                abs(other_top - seed_top) < max_y_gap
                or abs(other_bottom - seed_bottom) < max_y_gap
            ):
                group.append(entries[j])
                used.add(j)

        groups.append(group)

    return groups
|
| 40 |
|