import cv2
import numpy as np
from PIL import Image


class ImageUtils:
    """Helpers for square-resizing, normalizing, masking, and reading-order
    sorting of word crops in an OCR-style pipeline."""

    def resize(self, image, target_size, is_mask=False):
        """Resize `image` so its longest side equals `target_size`, then
        center-pad it onto a square `target_size` x `target_size` canvas.

        Masks are resized with nearest-neighbour (label values preserved) and
        padded with 0; images use linear interpolation and white (255) padding.

        Returns (canvas, scale, top, left, (orig_h, orig_w)) — the metadata
        `restore_size` needs to undo the transform.
        """
        # A 3-channel mask is collapsed to its first channel.
        if is_mask and image.ndim == 3:
            image = image[:, :, 0]
        if image.ndim == 3:
            h, w, c = image.shape
        else:
            h, w = image.shape
            c = 1
        scale = target_size / max(h, w)
        # Clamp to >= 1 px: extreme aspect ratios can round the short side
        # down to 0, which cv2.resize rejects.
        new_w = max(1, int(w * scale))
        new_h = max(1, int(h * scale))
        # Nearest-neighbour keeps mask labels intact.
        interpolation = cv2.INTER_NEAREST if is_mask else cv2.INTER_LINEAR
        resized = cv2.resize(image, (new_w, new_h), interpolation=interpolation)
        shape = (target_size, target_size) if c == 1 else (target_size, target_size, c)
        if is_mask:
            # dtype matches the input: the previous np.zeros default (float64)
            # silently upcast integer label masks.
            canvas = np.zeros(shape, dtype=image.dtype)
        else:
            canvas = np.full(shape, 255, dtype=np.uint8)
        top = (target_size - new_h) // 2
        left = (target_size - new_w) // 2
        if c > 1:
            canvas[top:top + new_h, left:left + new_w, :] = resized
        else:
            canvas[top:top + new_h, left:left + new_w] = resized
        return canvas, scale, top, left, (h, w)

    def to_cv2(self, image):
        """Swap the R and B channels (RGB <-> BGR; the swap is symmetric,
        so BGR2RGB also converts a PIL RGB array to OpenCV BGR order)."""
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    def normalize(self, image):
        """Scale pixel values to [0, 1]; a 2-D input gains a trailing
        channel axis so downstream code always sees (H, W, C)."""
        image = image / 255.0
        if image.ndim == 2:
            image = np.expand_dims(image, axis=-1)
        return image

    def darken_mask(self, mask):
        """Map a [0, 1] mask to a uint8 image in [0, 255]."""
        return (mask * 255).astype(np.uint8)

    def restore_size(self, mask, scale, top, left, orig_h, orig_w):
        """Invert `resize` for a mask: strip the centre padding, then scale
        back to (orig_h, orig_w) with nearest-neighbour interpolation."""
        # Mirror resize()'s >= 1 px clamp so the crop matches the pasted region.
        crop_h = max(1, int(orig_h * scale))
        crop_w = max(1, int(orig_w * scale))
        mask_unpadded = mask[top:top + crop_h, left:left + crop_w]
        return cv2.resize(mask_unpadded.astype(np.uint8), (orig_w, orig_h),
                          interpolation=cv2.INTER_NEAREST)

    def mask_image(self, image, mask):
        """Zero out `image` pixels where `mask` (values in [0, 1]) is 0."""
        mask = (mask * 255).astype(np.uint8)
        return cv2.bitwise_and(image, image, mask=mask)

    def extract_masks(self, image, mask):
        """Crop one sub-image per external contour of each non-zero label in
        `mask`. Returns a list of (crop, (x, y, w, h)) tuples."""
        words = []
        labels = np.unique(mask)
        labels = labels[labels != 0]  # 0 is background
        for label in labels:
            binary_mask = (mask == label).astype(np.uint8)
            contours, _ = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_SIMPLE)
            for cnt in contours:
                x, y, w, h = cv2.boundingRect(cnt)
                words.append((image[y:y + h, x:x + w], (x, y, w, h)))
        return words

    def order_words(self, words, vertical_padding_factor=1.2, min_padding_px=2,
                    height_mode='min'):
        """Sort word crops into reading order: cluster into text lines by
        vertical proximity (tolerance derived from word heights via
        `height_mode`: 'median' or the default minimum), then sort each line
        left-to-right.

        `words` is a list of (img, (x, y, w, h)) as produced by
        `extract_masks`. Returns (ordered_images, ordered_bboxes).
        """
        # Guard before any NumPy work — np.min on an empty array would raise.
        if not words:
            return [], []
        items = []
        for img, (x, y, w, h) in words:
            items.append({'img': img, 'x': x, 'y': y, 'w': w, 'h': h,
                          'cx': x + w / 2.0, 'cy': y + h / 2.0})
        heights = np.array([it['h'] for it in items])
        if height_mode == 'median':
            base_h = float(np.median(heights))
        else:
            base_h = float(np.min(heights))
        # Vertical tolerance for "same line" membership, floored at
        # min_padding_px so tiny words still cluster.
        pad = max(int(round(base_h * vertical_padding_factor)),
                  int(min_padding_px))
        items_sorted = sorted(items, key=lambda it: it['cy'])
        lines = []
        current_line = [items_sorted[0]]
        for it in items_sorted[1:]:
            # A word joins the current line if its centre-y is within `pad`
            # of the line's running median centre-y.
            current_median = np.median([p['cy'] for p in current_line])
            if abs(it['cy'] - current_median) <= pad:
                current_line.append(it)
            else:
                lines.append(current_line)
                current_line = [it]
        if current_line:
            lines.append(current_line)
        ordered_items = []
        for line in lines:
            line.sort(key=lambda it: it['x'])
            ordered_items.extend(line)
        ordered_imgs = [it['img'] for it in ordered_items]
        ordered_bboxes = [(int(it['x']), int(it['y']),
                           int(it['w']), int(it['h'])) for it in ordered_items]
        return ordered_imgs, ordered_bboxes