Spaces:

III111II1I1
/

detector

Sleeping

detector / utils /textblock.py

II11ll

init

ac8579b 8 months ago

21.4 kB

	from typing import List
	import numpy as np
	from shapely.geometry import Polygon
	import math
	import copy
	from utils.imgproc_utils import union_area, xywh2xyxypoly, rotate_polygons
	import cv2

	# Language labels; a label's position in this list is its class index.
	LANG_LIST = ['eng', 'ja', 'unknown']
	# Reverse lookup: language label -> class index.
	LANGCLS2IDX = {label: index for index, label in enumerate(LANG_LIST)}

	class TextBlock(object):
	    """A text block: a bounding box plus the text-line polygons inside it.

	    Besides geometry (``xyxy``, ``lines``, ``angle``, ``vec``, ``norm``,
	    ``distance``) an instance stores ``text``, ``translation`` and the font /
	    colour attributes passed to ``__init__``.  Unknown keyword arguments are
	    accepted and ignored.
	    """

	    def __init__(self, xyxy: List,
	                 lines: List = None,
	                 language: str = 'unknown',
	                 vertical: bool = False,
	                 font_size: float = -1,
	                 distance: List = None,
	                 angle: int = 0,
	                 vec: List = None,
	                 norm: float = -1,
	                 merged: bool = False,
	                 weight: float = -1,
	                 text: List = None,
	                 translation: str = "",
	                 fg_r = 0,
	                 fg_g = 0,
	                 fg_b = 0,
	                 bg_r = 0,
	                 bg_g = 0,
	                 bg_b = 0,
	                 line_spacing = 1.,
	                 font_family: str = "",
	                 bold: bool = False,
	                 underline: bool = False,
	                 italic: bool = False,
	                 alignment: int = -1,
	                 alpha: float = 255,
	                 rich_text: str = "",
	                 _bounding_rect: List = None,
	                 accumulate_color = True,
	                 default_stroke_width = 0.2,
	                 target_lang: str = "",
	                 **kwargs) -> None:
	        self.xyxy = [int(num) for num in xyxy]  # bounding box of the text block
	        self.lines = [] if lines is None else lines  # polygons of the text lines
	        self.vertical = vertical  # orientation of the text lines
	        self.language = language
	        self.font_size = font_size  # font pixel size
	        # distance between the text lines and the "origin"
	        self.distance = None if distance is None else np.array(distance, np.float64)
	        self.angle = angle  # rotation angle of the text lines

	        # primary vector of the text block and its norm
	        self.vec = None if vec is None else np.array(vec, np.float64)
	        self.norm = norm
	        self.merged = merged
	        self.weight = weight

	        self.text = text if text is not None else []
	        self.prob = 1

	        self.translation = translation

	        # note: these are accumulative rgb values of the text lines
	        self.fg_r = fg_r
	        self.fg_g = fg_g
	        self.fg_b = fg_b
	        self.bg_r = bg_r
	        self.bg_g = bg_g
	        self.bg_b = bg_b

	        self.font_family: str = font_family
	        self.bold: bool = bold
	        self.underline: bool = underline
	        self.italic: bool = italic
	        self.alpha = alpha
	        self.rich_text = rich_text
	        self.line_spacing = line_spacing
	        # stored privately; read through alignment() / target_lang()
	        self._alignment = alignment
	        self._target_lang = target_lang

	        self._bounding_rect = _bounding_rect
	        self.default_stroke_width = default_stroke_width
	        self.accumulate_color = accumulate_color

	    def adjust_bbox(self, with_bbox=False):
	        """Refit ``xyxy`` to the extent of the text lines.

	        With ``with_bbox=True`` the current box can only grow (union of the
	        old box and the lines); otherwise it is replaced by the lines' extent.
	        A block without lines is left unchanged.
	        """
	        if len(self.lines) == 0:
	            return
	        lines = self.lines_array().astype(np.int32)
	        # Plain ints keep xyxy consistent with what __init__ stores.
	        x_min, y_min = int(lines[..., 0].min()), int(lines[..., 1].min())
	        x_max, y_max = int(lines[..., 0].max()), int(lines[..., 1].max())
	        if with_bbox:
	            x_min = min(x_min, self.xyxy[0])
	            y_min = min(y_min, self.xyxy[1])
	            x_max = max(x_max, self.xyxy[2])
	            y_max = max(y_max, self.xyxy[3])
	        self.xyxy[0] = x_min
	        self.xyxy[1] = y_min
	        self.xyxy[2] = x_max
	        self.xyxy[3] = y_max

	    def sort_lines(self):
	        """Sort ``lines`` and ``distance`` together by ascending distance.

	        No-op when ``distance`` is None.
	        """
	        if self.distance is not None:
	            idx = np.argsort(self.distance)
	            self.distance = self.distance[idx]
	            lines = np.array(self.lines, dtype=np.int32)
	            self.lines = lines[idx].tolist()

	    def lines_array(self, dtype=np.float64):
	        """Return ``lines`` as a NumPy array of the given dtype."""
	        return np.array(self.lines, dtype=dtype)

	    def aspect_ratio(self) -> float:
	        """Return height / width of ``min_rect()``.

	        Both lengths are measured between the midpoints of opposite edges.
	        """
	        min_rect = self.min_rect()
	        middle_pnts = (min_rect[:, [1, 2, 3, 0]] + min_rect) / 2
	        norm_v = np.linalg.norm(middle_pnts[:, 2] - middle_pnts[:, 0])
	        norm_h = np.linalg.norm(middle_pnts[:, 1] - middle_pnts[:, 3])
	        return norm_v / norm_h

	    def center(self):
	        """Return the centre point of ``xyxy`` as a length-2 array."""
	        xyxy = np.array(self.xyxy)
	        return (xyxy[:2] + xyxy[2:]) / 2

	    def min_rect(self, rotate_back=True):
	        """Return the rectangle enclosing all lines, shaped (1, 4, 2), int64.

	        For a non-zero ``angle`` the lines are first rotated by ``angle``
	        around the block centre, the axis-aligned extent is taken, and (when
	        ``rotate_back`` is true) the rectangle is rotated by ``-angle``.
	        """
	        angled = self.angle != 0
	        center = self.center()
	        polygons = self.lines_array().reshape(-1, 8)
	        if angled:
	            polygons = rotate_polygons(center, polygons, self.angle)
	        min_x = polygons[:, ::2].min()
	        min_y = polygons[:, 1::2].min()
	        max_x = polygons[:, ::2].max()
	        max_y = polygons[:, 1::2].max()
	        min_bbox = np.array([[min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y]])
	        if angled and rotate_back:
	            min_bbox = rotate_polygons(center, min_bbox, -self.angle)
	        return min_bbox.reshape(-1, 4, 2).astype(np.int64)

	    # equivalent to qt's boundingRect, ignore angle
	    def bounding_rect(self):
	        """Return ``[x, y, w, h]``.

	        Uses the stored ``_bounding_rect`` when one was given; otherwise it is
	        computed from ``min_rect(rotate_back=False)``.
	        """
	        if self._bounding_rect is None:
	            min_bbox = self.min_rect(rotate_back=False)[0]
	            x, y = min_bbox[0]
	            w, h = min_bbox[2] - min_bbox[0]
	            return [x, y, w, h]
	        return self._bounding_rect

	    def __getattribute__(self, name: str):
	        # ``pts`` is a virtual attribute: always the current lines as an array.
	        if name == 'pts':
	            return self.lines_array()
	        return object.__getattribute__(self, name)

	    def __len__(self):
	        """Number of text lines in the block."""
	        return len(self.lines)

	    def __getitem__(self, idx):
	        """Return the text line(s) at ``idx``."""
	        return self.lines[idx]

	    def to_dict(self):
	        """Return a deep copy of the instance attributes as a dict."""
	        blk_dict = copy.deepcopy(vars(self))
	        return blk_dict

	    def get_transformed_region(self, img, idx, textheight) -> np.ndarray:
	        """Warp text line ``idx`` out of ``img`` into a rectified strip.

	        For 'eng' blocks, and for non-vertical 'unknown' blocks, the line
	        quad is first padded by ``font_size / 3`` at every corner and clipped
	        to the image.  Vertical lines are rotated 90 degrees counter-clockwise
	        after warping, so the returned image is ``int(textheight)`` pixels
	        high in both orientations.
	        """
	        im_h, im_w = img.shape[:2]
	        src_pts = np.array(self.lines[idx], dtype=np.float64)

	        if self.language == 'eng' or (self.language == 'unknown' and not self.vertical):
	            e_size = self.font_size / 3
	            src_pts[..., 0] += np.array([-e_size, e_size, e_size, -e_size])
	            src_pts[..., 1] += np.array([-e_size, -e_size, e_size, e_size])
	            src_pts[..., 0] = np.clip(src_pts[..., 0], 0, im_w)
	            src_pts[..., 1] = np.clip(src_pts[..., 1], 0, im_h)

	        middle_pnt = (src_pts[[1, 2, 3, 0]] + src_pts) / 2
	        vec_v = middle_pnt[2] - middle_pnt[0]  # vertical vector of the text line
	        vec_h = middle_pnt[1] - middle_pnt[3]  # horizontal vector of the text line
	        ratio = np.linalg.norm(vec_v) / np.linalg.norm(vec_h)

	        # The short side of the line is scaled to textheight; the long side
	        # follows from the quad's aspect ratio.
	        if self.vertical:
	            w = int(textheight)
	            h = int(round(textheight * ratio))
	        else:
	            h = int(textheight)
	            w = int(round(textheight / ratio))
	        dst_pts = np.array([[0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1]]).astype(np.float32)
	        M, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
	        region = cv2.warpPerspective(img, M, (w, h))
	        if self.vertical:
	            region = cv2.rotate(region, cv2.ROTATE_90_COUNTERCLOCKWISE)
	        return region

	    def get_text(self):
	        """Return ``text`` as one string (list items joined by spaces, stripped)."""
	        if isinstance(self.text, str):
	            return self.text
	        return ' '.join(self.text).strip()

	    def set_font_colors(self, frgb, srgb, accumulate=True):
	        """Store font (``frgb``) and stroke (``srgb``) colours.

	        When ``accumulate`` is true and the block has lines, each channel is
	        stored multiplied by the number of lines; ``get_font_colors`` divides
	        it back out.
	        """
	        self.accumulate_color = accumulate
	        num_lines = len(self.lines) if accumulate and len(self.lines) > 0 else 1
	        # set font color
	        frgb = np.array(frgb) * num_lines
	        self.fg_r, self.fg_g, self.fg_b = frgb
	        # set stroke color
	        srgb = np.array(srgb) * num_lines
	        self.bg_r, self.bg_g, self.bg_b = srgb

	    def get_font_colors(self, bgr=False):
	        """Return ``(font_colour, stroke_colour)``.

	        With ``accumulate_color`` set, the stored sums are averaged over the
	        number of lines (``[0, 0, 0]`` twice if there are no lines).  When
	        ``bgr`` is true both colours are returned in reversed channel order;
	        this now also applies to non-accumulated colours.
	        """
	        frgb = np.array([self.fg_r, self.fg_g, self.fg_b])
	        brgb = np.array([self.bg_r, self.bg_g, self.bg_b])
	        if self.accumulate_color:
	            num_lines = len(self.lines)
	            if num_lines == 0:
	                return [0, 0, 0], [0, 0, 0]
	            frgb = (frgb / num_lines).astype(np.int32)
	            brgb = (brgb / num_lines).astype(np.int32)
	        if bgr:
	            return frgb[::-1], brgb[::-1]
	        return frgb, brgb

	    def xywh(self):
	        """Return the bounding box as ``[x, y, width, height]``."""
	        x1, y1, x2, y2 = self.xyxy
	        return [x1, y1, x2 - x1, y2 - y1]

	    # alignleft: 0, center: 1, right: 2
	    def alignment(self):
	        """Return the alignment code of the block.

	        An explicitly stored alignment (>= 0) wins.  Otherwise vertical and
	        single-line blocks report 0; for the rest, 0 is returned when the
	        lines' first-corner x values vary less than the x midpoints of their
	        first two corners, else 1.
	        """
	        if self._alignment >= 0:
	            return self._alignment
	        elif self.vertical:
	            return 0
	        lines = self.lines_array()
	        if len(lines) == 1:
	            return 0
	        angled = self.angle != 0
	        polygons = lines.reshape(-1, 8)
	        if angled:
	            polygons = rotate_polygons((0, 0), polygons, self.angle)
	        polygons = polygons.reshape(-1, 4, 2)

	        left_std = np.std(polygons[:, 0, 0])
	        center_std = np.std((polygons[:, 0, 0] + polygons[:, 1, 0]) / 2)
	        if left_std < center_std:
	            return 0
	        else:
	            return 1

	    def target_lang(self):
	        """Return the target language given to ``__init__``."""
	        # The value lives in ``_target_lang``; ``self.target_lang`` is this
	        # method itself.
	        return self._target_lang

	    @property
	    def stroke_width(self):
	        """``default_stroke_width`` if font and stroke colours differ, else 0.

	        The colours count as different when the summed absolute channel
	        difference of the stored values exceeds 40.
	        """
	        var = np.array([self.fg_r, self.fg_g, self.fg_b]) \
	            - np.array([self.bg_r, self.bg_g, self.bg_b])
	        var = np.abs(var).sum()
	        if var > 40:
	            return self.default_stroke_width
	        return 0

	def sort_textblk_list(blk_list: List[TextBlock], im_w: int, im_h: int) -> List[TextBlock]:
	    """Sort ``blk_list`` in place by a grid-based weight and return it.

	    Each page is divided into 4 rows x 3 columns; blocks are ordered by grid
	    cell and then by their offset inside the cell.  An image wider than it is
	    tall is treated as two pages side by side.  When more than half of the
	    blocks are 'ja', x is mirrored first.  The weight is also stored on each
	    block as ``blk.weight``.
	    """
	    if len(blk_list) == 0:
	        return blk_list

	    ja_count = sum(1 for blk in blk_list if blk.language == 'ja')
	    mirror_x = ja_count > len(blk_list) / 2
	    boxes = np.array([blk.xyxy for blk in blk_list])

	    rows, cols = 4, 3
	    full_w = im_w
	    page_w = im_w / 2 if im_w > im_h else im_w
	    two_pages = page_w != full_w
	    page_area = im_h * page_w

	    cx = (boxes[:, 0] + boxes[:, 2]) / 2
	    if mirror_x:
	        # Mirror around the full image width (identical to the page width
	        # when there is a single page).
	        cx = full_w - cx
	    cy = (boxes[:, 1] + boxes[:, 3]) / 2

	    col_idx = (cx / page_w * cols).astype(np.int32)
	    row_idx = (cy / im_h * rows).astype(np.int32)
	    cell_idx = row_idx * cols + col_idx

	    # Offset inside the cell; x counts slightly more than y.
	    x_in_cell = 1.2 * (cx - col_idx * page_w / cols)
	    y_in_cell = cy - row_idx * im_h / rows
	    weights = cell_idx * page_area + x_in_cell + y_in_cell
	    if two_pages:
	        # Columns past the first page sort after every cell of that page.
	        weights[col_idx >= cols] += page_area * rows * cols

	    for blk, blk_weight in zip(blk_list, weights):
	        blk.weight = blk_weight
	    blk_list.sort(key=lambda item: item.weight)
	    return blk_list

	def examine_textblk(blk: TextBlock, im_w: int, im_h: int, sort: bool = False) -> None:
	    """Derive orientation, angle, font size and line distances for ``blk``.

	    Writes ``lines`` (as int lists), ``distance``, ``angle``, ``font_size``,
	    ``vertical``, ``vec`` and ``norm`` on the block, and sorts its lines by
	    distance when ``sort`` is true.  ``im_h`` is accepted but not used.
	    """
	    lines = blk.lines_array()
	    middle_pnts = (lines[:, [1, 2, 3, 0]] + lines) / 2
	    vec_v = middle_pnts[:, 2] - middle_pnts[:, 0]  # vertical vectors of textlines
	    vec_h = middle_pnts[:, 1] - middle_pnts[:, 3]  # horizontal vectors of textlines
	    # if sum of vertical vectors is longer, then text orientation is vertical, and vice versa.
	    center_pnts = (lines[:, 0] + lines[:, 2]) / 2
	    v = np.sum(vec_v, axis=0)
	    h = np.sum(vec_h, axis=0)
	    norm_v, norm_h = np.linalg.norm(v), np.linalg.norm(h)
	    if blk.language == 'ja':
	        vertical = bool(norm_v > norm_h)
	    else:
	        # non-'ja' text needs a clearly longer vertical extent to count as vertical
	        vertical = bool(norm_v > norm_h * 2)

	    # calculate distance between textlines and origin
	    if vertical:
	        primary_vec, primary_norm = v, norm_v
	        # vertical manga text is read from right to left, so origin is (imw, 0)
	        distance_vectors = center_pnts - np.array([[im_w, 0]], dtype=np.float64)
	        font_size = int(round(norm_h / len(lines)))
	    else:
	        primary_vec, primary_norm = h, norm_h
	        distance_vectors = center_pnts - np.array([[0, 0]], dtype=np.float64)
	        font_size = int(round(norm_v / len(lines)))

	    # rotation angle of textlines
	    rotation_angle = int(math.atan2(primary_vec[1], primary_vec[0]) / math.pi * 180)
	    # distance between textline centers and origin
	    distance = np.linalg.norm(distance_vectors, axis=1)

	    # Cosine of the angle between each centre vector and the primary vector.
	    # A zero denominator is treated as cosine 1, and the result is clipped so
	    # rounding cannot push it outside arccos' domain; both would otherwise
	    # turn the distance into NaN.
	    dot = np.einsum('ij, j->i', distance_vectors, primary_vec)
	    denom = distance * primary_norm
	    cos_angle = np.ones_like(denom)
	    np.divide(dot, denom, out=cos_angle, where=denom > 0)
	    cos_angle = np.clip(cos_angle, -1.0, 1.0)
	    # component of the centre vector perpendicular to the primary vector
	    distance = np.abs(np.sin(np.arccos(cos_angle)) * distance)

	    blk.lines = lines.astype(np.int32).tolist()
	    blk.distance = distance
	    blk.angle = rotation_angle
	    if vertical:
	        blk.angle -= 90
	    if abs(blk.angle) < 3:
	        # snap near-zero angles to exactly 0
	        blk.angle = 0
	    blk.font_size = font_size
	    blk.vertical = vertical
	    blk.vec = primary_vec
	    blk.norm = primary_norm
	    if sort:
	        blk.sort_lines()

	def try_merge_textline(blk: TextBlock, blk2: TextBlock, fntsize_tol=1.3, distance_tol=2) -> bool:
	    """Try to absorb ``blk2`` into ``blk``; return True when it was merged.

	    Already-merged ``blk2`` is rejected.  If the last lines of the two blocks
	    do not intersect, the merge additionally requires similar font sizes
	    (ratio within ``fntsize_tol``), directions within 30 degrees of each
	    other and a small enough gap.  On success ``blk`` gains the first line of
	    ``blk2`` and updated vec / angle / norm / distance / font_size, and
	    ``blk2.merged`` is set.
	    """
	    if blk2.merged:
	        return False

	    size_ratio = blk.font_size / blk2.font_size
	    count_a, count_b = len(blk), len(blk2)
	    mean_size = (blk.font_size * count_a + blk2.font_size * count_b) / (count_a + count_b)
	    dot = blk.vec @ blk2.vec
	    combined_vec = blk.vec + blk2.vec
	    cos_between = dot / blk.norm / blk2.norm
	    gap = blk2.distance[-1] - blk.distance[-1]
	    corner_gap = np.linalg.norm(np.array(blk2.lines[-1][0]) - np.array(blk.lines[-1][0]))
	    last_a, last_b = Polygon(blk.lines[-1]), Polygon(blk2.lines[-1])

	    if not last_a.intersects(last_b):
	        # font sizes too different (either way round)
	        if size_ratio > fntsize_tol or 1 / size_ratio > fntsize_tol:
	            return False
	        # directions differ by more than 30 degrees (0.866 = cos30)
	        if abs(cos_between) < 0.866:
	            return False
	        # lines too far apart
	        if gap > distance_tol * mean_size or corner_gap > mean_size * 2.5:
	            return False

	    # merge
	    merged_angle = int(round(np.rad2deg(math.atan2(combined_vec[1], combined_vec[0]))))
	    if blk.vertical:
	        merged_angle -= 90
	    blk.lines.append(blk2.lines[0])
	    blk.vec = combined_vec
	    blk.angle = merged_angle
	    blk.norm = np.linalg.norm(combined_vec)
	    blk.distance = np.append(blk.distance, blk2.distance[-1])
	    blk.font_size = mean_size
	    blk2.merged = True
	    return True

	def merge_textlines(blk_list: List[TextBlock]) -> List[TextBlock]:
	    """Greedily merge blocks and return the surviving ones.

	    A list with fewer than two blocks is returned unchanged.  Otherwise the
	    list is sorted in place by each block's first distance, every unmerged
	    block tries to absorb each block after it, and the survivors get their
	    bounding box refitted to their lines.
	    """
	    if len(blk_list) < 2:
	        return blk_list

	    blk_list.sort(key=lambda item: item.distance[0])
	    survivors = []
	    for pos, head in enumerate(blk_list):
	        if head.merged:
	            continue
	        for candidate in blk_list[pos + 1:]:
	            try_merge_textline(head, candidate)
	        survivors.append(head)

	    for survivor in survivors:
	        survivor.adjust_bbox(with_bbox=False)
	    return survivors

	def split_textblk(blk: TextBlock):
	    """Split ``blk`` into sub-blocks wherever consecutive lines are far apart.

	    The lines are first re-ordered (in place) by the distance of their first
	    corner from the first line's first corner; ``blk.distance`` is re-ordered
	    with the same permutation so ``distance[i]`` keeps describing
	    ``lines[i]``.  Consecutive lines that do not intersect start a new
	    sub-block when their distance gap exceeds twice the font size, or, for
	    near-upright vertical text, when their first corners differ by more than
	    one font size in y.

	    Returns ``(was_split, sub_blocks)``.  Sub-blocks are deep copies of
	    ``blk`` carrying their own lines and the matching slice of the distances;
	    their bounding boxes are refitted only when a split happened.
	    """
	    font_size, distance, lines = blk.font_size, blk.distance, blk.lines

	    anchor = np.array(lines[0][0])
	    keys = [np.linalg.norm(np.array(line[0]) - anchor) for line in lines]
	    # sorted() is stable, so the original first line (key 0) stays first.
	    order = sorted(range(len(lines)), key=keys.__getitem__)
	    lines[:] = [lines[i] for i in order]
	    if distance is not None:
	        # Apply the same permutation, otherwise gaps below would be measured
	        # between distances that belong to other lines.
	        distance = np.asarray(distance)[order]
	        blk.distance = distance

	    distance_tol = font_size * 2
	    current_blk = copy.deepcopy(blk)
	    current_blk.lines = [lines[0]]
	    sub_blk_list = [current_blk]
	    starts = [0]  # index in ``lines`` where each sub-block begins
	    for jj, line in enumerate(lines[1:]):
	        prev_line = lines[jj]
	        split = False
	        if not Polygon(prev_line).intersects(Polygon(line)):
	            line_distance = abs(distance[jj + 1] - distance[jj])
	            if line_distance > distance_tol:
	                split = True
	            elif blk.vertical and abs(blk.angle) < 15:
	                if len(current_blk.lines) > 1 or line_distance > font_size:
	                    split = abs(prev_line[0][1] - line[0][1]) > font_size
	        if split:
	            current_blk = copy.deepcopy(current_blk)
	            current_blk.lines = [line]
	            sub_blk_list.append(current_blk)
	            starts.append(jj + 1)
	        else:
	            current_blk.lines.append(line)

	    if distance is not None:
	        # Each sub-block keeps only the distances of its own lines.
	        bounds = starts + [len(lines)]
	        for sub_blk, lo, hi in zip(sub_blk_list, bounds[:-1], bounds[1:]):
	            sub_blk.distance = distance[lo:hi].copy()

	    textblock_splitted = len(sub_blk_list) > 1
	    if textblock_splitted:
	        for sub_blk in sub_blk_list:
	            sub_blk.adjust_bbox(with_bbox=False)
	    return textblock_splitted, sub_blk_list

	def group_output(blks, lines, im_w, im_h, mask=None, sort_blklist=True) -> List[TextBlock]:
	    """Group detected text lines into text blocks.

	    ``blks`` is unpacked as parallel sequences of (bbox, class index,
	    confidence); ``lines`` is iterated as per-line point arrays indexed
	    ``[:, 0]`` / ``[:, 1]``.  ``mask``, when given, is sliced with the box
	    coordinates and its mean / 255 is used to drop regions with too little
	    text.  Returns the final list of blocks, grid-sorted when
	    ``sort_blklist`` is true.
	    """
	    bbox_score_thresh = 0.4
	    mask_score_thresh = 0.1

	    # cls could give wrong result
	    blk_list: List[TextBlock] = [
	        TextBlock(bbox, language=LANG_LIST[cls]) for bbox, cls, _ in zip(*blks)
	    ]
	    scattered_lines = {'ver': [], 'hor': []}

	    # step1: filter & assign lines to textblocks
	    for line in lines:
	        bx1, bx2 = line[:, 0].min(), line[:, 0].max()
	        by1, by2 = line[:, 1].min(), line[:, 1].max()
	        line_area = (by2 - by1) * (bx2 - bx1)

	        # pick the block whose box covers the largest share of the line
	        best_score, best_idx = -1, -1
	        for cand_idx, cand in enumerate(blk_list):
	            cand_score = union_area(cand.xyxy, [bx1, by1, bx2, by2]) / line_area
	            if best_score < cand_score:
	                best_score, best_idx = cand_score, cand_idx

	        if best_score > bbox_score_thresh:
	            blk_list[best_idx].lines.append(line)
	            continue

	        # if no textblock was assigned, check whether there is "enough" textmask
	        if mask is not None:
	            mask_score = mask[by1: by2, bx1: bx2].mean() / 255
	            if mask_score < mask_score_thresh:
	                continue
	        single = TextBlock([bx1, by1, bx2, by2], [line])
	        examine_textblk(single, im_w, im_h, sort=False)
	        scattered_lines['ver' if single.vertical else 'hor'].append(single)

	    # step2: filter textblocks, sort & split textlines
	    final_blk_list = []
	    for blk in blk_list:
	        # filter textblocks
	        if len(blk.lines) == 0:
	            bx1, by1, bx2, by2 = blk.xyxy
	            if mask is not None:
	                mask_score = mask[by1: by2, bx1: bx2].mean() / 255
	                if mask_score < mask_score_thresh:
	                    continue
	            # no lines were assigned: use the block's own box as its only line
	            box_xywh = np.array([[bx1, by1, bx2 - bx1, by2 - by1]])
	            blk.lines = xywh2xyxypoly(box_xywh).reshape(-1, 4, 2).tolist()
	        examine_textblk(blk, im_w, im_h, sort=True)

	        # split manga text if there is a distance gap
	        try_split = len(blk.lines) > 1 and (blk.language == 'ja' or blk.vertical)
	        if try_split:
	            was_split, sub_blks = split_textblk(blk)
	        else:
	            was_split, sub_blks = False, [blk]

	        # modify textblock to fit its textlines
	        if not was_split:
	            for sub_blk in sub_blks:
	                sub_blk.adjust_bbox(with_bbox=True)
	        final_blk_list.extend(sub_blks)

	    # step3: merge scattered lines, sort textblocks by "grid"
	    final_blk_list.extend(merge_textlines(scattered_lines['hor']))
	    final_blk_list.extend(merge_textlines(scattered_lines['ver']))
	    if sort_blklist:
	        final_blk_list = sort_textblk_list(final_blk_list, im_w, im_h)

	    # grow the lines (and font size) of horizontal 'eng' blocks a little
	    corner_signs = np.array([[[-1, -1], [1, -1], [1, 1], [-1, 1]]])
	    for blk in final_blk_list:
	        if blk.language != 'eng' or blk.vertical:
	            continue
	        if len(blk.lines) == 0:
	            continue
	        expand_size = max(int(blk.font_size * 0.1), 2)
	        rad = np.deg2rad(blk.angle)
	        shift = corner_signs * np.array([[[np.sin(rad), np.cos(rad)]]]) * expand_size
	        grown = blk.lines_array() + shift
	        grown[..., 0] = np.clip(grown[..., 0], 0, im_w-1)
	        grown[..., 1] = np.clip(grown[..., 1], 0, im_h-1)
	        blk.lines = grown.astype(np.int64).tolist()
	        blk.font_size += expand_size

	    return final_blk_list

	def visualize_textblocks(canvas, blk_list: List[TextBlock], path = '../output/'):
	    """Draw every block on ``canvas`` (in place) and return the canvas.

	    Per block: its bounding box, each line polygon with its index, the
	    ``min_rect`` outline, the angle at the box centre and the block index at
	    the top-left corner.  The box region of each block is also written to
	    ``path`` as ``cut_image_<index>.png``.
	    """
	    box_color = (127, 255, 127)
	    line_color = (0, 127, 255)
	    line_label_color = (255, 127, 0)
	    rect_color = (127, 127, 0)
	    angle_color = (127, 127, 255)
	    index_color = (255, 127, 127)

	    lw = max(round(sum(canvas.shape) / 2 * 0.003), 2)  # line width
	    for blk_idx, blk in enumerate(blk_list):
	        bx1, by1, bx2, by2 = blk.xyxy
	        cv2.rectangle(canvas, (bx1, by1), (bx2, by2), box_color, lw)
	        # the crop is taken after the box outline was drawn on the canvas
	        cv2.imwrite(path + f'/cut_image_{blk_idx}.png', canvas[by1:by2, bx1:bx2])

	        for line_idx, line in enumerate(blk.lines_array(dtype=np.int32)):
	            cv2.putText(canvas, str(line_idx), line[0], cv2.FONT_HERSHEY_SIMPLEX, 0.7, line_label_color, 1)
	            cv2.polylines(canvas, [line], True, line_color, 2)
	        cv2.polylines(canvas, [blk.min_rect()], True, rect_color, 2)

	        center = [int((bx1 + bx2)/2), int((by1 + by2)/2)]
	        cv2.putText(canvas, str(blk.angle), center, cv2.FONT_HERSHEY_SIMPLEX, 1, angle_color, 2)
	        cv2.putText(canvas, str(blk_idx), (bx1, by1 + lw + 2), 0, lw / 3, index_color, max(lw-1, 1), cv2.LINE_AA)
	    return canvas