# trocr_htr/utils/image.py
# feat(utils): update order_words method with configurable line grouping
# (commit 5858087, author: Partha11)
import cv2
import numpy as np
from PIL import Image
class ImageUtils:
    """Image/mask helpers: aspect-preserving resize with padding, value
    normalization, mask post-processing, and reading-order word extraction."""

    def resize(self, image, target_size, is_mask=False):
        """Fit ``image`` into a square ``target_size`` canvas, keeping aspect
        ratio and centering the result.

        Masks use nearest-neighbour interpolation (labels stay intact) and
        zero padding; images use linear interpolation and white (255) padding.

        Returns:
            (canvas, scale, top, left, (orig_h, orig_w)) — everything needed
            to invert the transform later (see ``restore_size``).
        """
        # Masks sometimes arrive as 3-channel; keep only the first plane.
        if is_mask and image.ndim == 3:
            image = image[:, :, 0]
        if image.ndim == 3:
            h, w, c = image.shape
        else:
            h, w = image.shape
            c = 1
        scale = target_size / max(h, w)
        # max(1, ...): truncation could yield a 0-sized dimension for extreme
        # aspect ratios, which would make cv2.resize raise.
        new_w = max(1, int(w * scale))
        new_h = max(1, int(h * scale))
        interpolation = cv2.INTER_NEAREST if is_mask else cv2.INTER_LINEAR
        resized = cv2.resize(image, (new_w, new_h), interpolation=interpolation)
        shape = (target_size, target_size) if c == 1 else (target_size, target_size, c)
        if is_mask:
            # Fix: keep the mask's own dtype — np.zeros() without a dtype
            # returned a float64 canvas while the image branch returned uint8.
            canvas = np.zeros(shape, dtype=image.dtype)
        else:
            canvas = np.full(shape, 255, dtype=np.uint8)
        top = (target_size - new_h) // 2
        left = (target_size - new_w) // 2
        # Same assignment works for 2-D and 3-D canvases.
        canvas[top:top + new_h, left:left + new_w] = resized
        return canvas, scale, top, left, (h, w)

    def to_cv2(self, image):
        """Swap the first and third channel (BGR <-> RGB)."""
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    def normalize(self, image):
        """Scale pixel values to [0, 1]; grayscale input gains a trailing
        channel axis so the result is always (H, W, C)."""
        image = image / 255.0
        if image.ndim == 2:
            image = np.expand_dims(image, axis=-1)
        return image

    def darken_mask(self, mask):
        """Convert a {0, 1} mask to a uint8 image in {0, 255}."""
        return (mask * 255).astype(np.uint8)

    def restore_size(self, mask, scale, top, left, orig_h, orig_w):
        """Invert ``resize``: crop the centered content region out of the
        padded square and scale it back to (orig_h, orig_w)."""
        # Mirror resize()'s max(1, ...) guard so the crop matches exactly.
        new_h = max(1, int(orig_h * scale))
        new_w = max(1, int(orig_w * scale))
        unpadded = mask[top:top + new_h, left:left + new_w]
        return cv2.resize(unpadded.astype(np.uint8), (orig_w, orig_h),
                          interpolation=cv2.INTER_NEAREST)

    def mask_image(self, image, mask):
        """Zero out every pixel of ``image`` where ``mask`` is 0."""
        mask = (mask * 255).astype(np.uint8)
        return cv2.bitwise_and(image, image, mask=mask)

    def extract_masks(self, image, mask):
        """Crop one sub-image per external contour of each non-zero label in
        ``mask``. Returns a list of (crop, (x, y, w, h)) tuples."""
        words = []
        for label in np.unique(mask):
            if label == 0:  # background
                continue
            binary = (mask == label).astype(np.uint8)
            contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_SIMPLE)
            for cnt in contours:
                x, y, w, h = cv2.boundingRect(cnt)
                words.append((image[y:y + h, x:x + w], (x, y, w, h)))
        return words

    def order_words(self, words, vertical_padding_factor=1.2, min_padding_px=2,
                    height_mode='min'):
        """Sort word crops into reading order.

        Words are grouped into lines: a word joins the current line when its
        vertical center is within ``pad`` of the line's running median center,
        where ``pad`` is derived from the min (or median, per ``height_mode``)
        word height times ``vertical_padding_factor``, floored at
        ``min_padding_px``. Lines are then read left-to-right.

        Args:
            words: iterable of (crop, (x, y, w, h)) tuples.
        Returns:
            (ordered_crops, ordered_bboxes) — bboxes as int (x, y, w, h).
        """
        if not words:
            return [], []
        items = [
            {'img': img, 'x': x, 'y': y, 'w': w, 'h': h, 'cy': y + h / 2.0}
            for img, (x, y, w, h) in words
        ]
        heights = np.array([it['h'] for it in items])
        if height_mode == 'median':
            base_h = float(np.median(heights))
        else:
            base_h = float(np.min(heights))
        pad = max(int(round(base_h * vertical_padding_factor)), int(min_padding_px))
        items.sort(key=lambda it: it['cy'])
        lines = [[items[0]]]
        for it in items[1:]:
            # Compare against the median center of the line built so far.
            if abs(it['cy'] - np.median([p['cy'] for p in lines[-1]])) <= pad:
                lines[-1].append(it)
            else:
                lines.append([it])
        ordered = []
        for line in lines:
            ordered.extend(sorted(line, key=lambda it: it['x']))
        ordered_imgs = [it['img'] for it in ordered]
        ordered_bboxes = [(int(it['x']), int(it['y']), int(it['w']), int(it['h']))
                          for it in ordered]
        return ordered_imgs, ordered_bboxes