# trocr_htr/utils/image.py
# feat(utils): update order_words method with configurable line grouping
# (commit 5858087, author: Partha11)
import cv2
import numpy as np
from PIL import Image
class ImageUtils:
    """Image/mask helpers: aspect-preserving resize with padding, value
    normalization, mask post-processing, and reading-order word extraction."""

    def resize(self, image, target_size, is_mask=False):
        """Fit ``image`` into a square ``target_size`` canvas, keeping aspect
        ratio and centering the result.

        Masks use nearest-neighbour interpolation (labels stay intact) and
        zero padding; images use linear interpolation and white (255) padding.

        Returns:
            (canvas, scale, top, left, (orig_h, orig_w)) — everything needed
            to invert the transform later (see ``restore_size``).
        """
        # Masks sometimes arrive as 3-channel; keep only the first plane.
        if is_mask and image.ndim == 3:
            image = image[:, :, 0]
        if image.ndim == 3:
            h, w, c = image.shape
        else:
            h, w = image.shape
            c = 1
        scale = target_size / max(h, w)
        # max(1, ...): truncation could yield a 0-sized dimension for extreme
        # aspect ratios, which would make cv2.resize raise.
        new_w = max(1, int(w * scale))
        new_h = max(1, int(h * scale))
        interpolation = cv2.INTER_NEAREST if is_mask else cv2.INTER_LINEAR
        resized = cv2.resize(image, (new_w, new_h), interpolation=interpolation)
        shape = (target_size, target_size) if c == 1 else (target_size, target_size, c)
        if is_mask:
            # Fix: keep the mask's own dtype — np.zeros() without a dtype
            # returned a float64 canvas while the image branch returned uint8.
            canvas = np.zeros(shape, dtype=image.dtype)
        else:
            canvas = np.full(shape, 255, dtype=np.uint8)
        top = (target_size - new_h) // 2
        left = (target_size - new_w) // 2
        # Same assignment works for 2-D and 3-D canvases.
        canvas[top:top + new_h, left:left + new_w] = resized
        return canvas, scale, top, left, (h, w)

    def to_cv2(self, image):
        """Swap the first and third channel (BGR <-> RGB)."""
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    def normalize(self, image):
        """Scale pixel values to [0, 1]; grayscale input gains a trailing
        channel axis so the result is always (H, W, C)."""
        image = image / 255.0
        if image.ndim == 2:
            image = np.expand_dims(image, axis=-1)
        return image

    def darken_mask(self, mask):
        """Convert a {0, 1} mask to a uint8 image in {0, 255}."""
        return (mask * 255).astype(np.uint8)

    def restore_size(self, mask, scale, top, left, orig_h, orig_w):
        """Invert ``resize``: crop the centered content region out of the
        padded square and scale it back to (orig_h, orig_w)."""
        # Mirror resize()'s max(1, ...) guard so the crop matches exactly.
        new_h = max(1, int(orig_h * scale))
        new_w = max(1, int(orig_w * scale))
        unpadded = mask[top:top + new_h, left:left + new_w]
        return cv2.resize(unpadded.astype(np.uint8), (orig_w, orig_h),
                          interpolation=cv2.INTER_NEAREST)

    def mask_image(self, image, mask):
        """Zero out every pixel of ``image`` where ``mask`` is 0."""
        mask = (mask * 255).astype(np.uint8)
        return cv2.bitwise_and(image, image, mask=mask)

    def extract_masks(self, image, mask):
        """Crop one sub-image per external contour of each non-zero label in
        ``mask``. Returns a list of (crop, (x, y, w, h)) tuples."""
        words = []
        for label in np.unique(mask):
            if label == 0:  # background
                continue
            binary = (mask == label).astype(np.uint8)
            contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_SIMPLE)
            for cnt in contours:
                x, y, w, h = cv2.boundingRect(cnt)
                words.append((image[y:y + h, x:x + w], (x, y, w, h)))
        return words

    def order_words(self, words, vertical_padding_factor=1.2, min_padding_px=2,
                    height_mode='min'):
        """Sort word crops into reading order.

        Words are grouped into lines: a word joins the current line when its
        vertical center is within ``pad`` of the line's running median center,
        where ``pad`` is derived from the min (or median, per ``height_mode``)
        word height times ``vertical_padding_factor``, floored at
        ``min_padding_px``. Lines are then read left-to-right.

        Args:
            words: iterable of (crop, (x, y, w, h)) tuples.
        Returns:
            (ordered_crops, ordered_bboxes) — bboxes as int (x, y, w, h).
        """
        if not words:
            return [], []
        items = [
            {'img': img, 'x': x, 'y': y, 'w': w, 'h': h, 'cy': y + h / 2.0}
            for img, (x, y, w, h) in words
        ]
        heights = np.array([it['h'] for it in items])
        if height_mode == 'median':
            base_h = float(np.median(heights))
        else:
            base_h = float(np.min(heights))
        pad = max(int(round(base_h * vertical_padding_factor)), int(min_padding_px))
        items.sort(key=lambda it: it['cy'])
        lines = [[items[0]]]
        for it in items[1:]:
            # Compare against the median center of the line built so far.
            if abs(it['cy'] - np.median([p['cy'] for p in lines[-1]])) <= pad:
                lines[-1].append(it)
            else:
                lines.append([it])
        ordered = []
        for line in lines:
            ordered.extend(sorted(line, key=lambda it: it['x']))
        ordered_imgs = [it['img'] for it in ordered]
        ordered_bboxes = [(int(it['x']), int(it['y']), int(it['w']), int(it['h']))
                          for it in ordered]
        return ordered_imgs, ordered_bboxes