"""Text block container and helpers for grouping detected text lines."""

import copy
import math
from typing import List

import cv2
import numpy as np
from shapely.geometry import Polygon

from utils.imgproc_utils import union_area, xywh2xyxypoly, rotate_polygons

LANG_LIST = ['eng', 'ja', 'unknown']
LANGCLS2IDX = {'eng': 0, 'ja': 1, 'unknown': 2}


class TextBlock(object):
    """A block of text made of one or more text-line polygons.

    Each entry of ``lines`` is a 4-point polygon (the code reshapes the
    lines to ``(-1, 4, 2)`` / ``(-1, 8)``). ``xyxy`` is the block's
    bounding box stored as ``[x1, y1, x2, y2]`` integers.
    """

    def __init__(self, xyxy: List,
                 lines: List = None,
                 language: str = 'unknown',
                 vertical: bool = False,
                 font_size: float = -1,
                 distance: List = None,
                 angle: int = 0,
                 vec: List = None,
                 norm: float = -1,
                 merged: bool = False,
                 weight: float = -1,
                 text: List = None,
                 translation: str = "",
                 fg_r = 0,
                 fg_g = 0,
                 fg_b = 0,
                 bg_r = 0,
                 bg_g = 0,
                 bg_b = 0,
                 line_spacing = 1.,
                 font_family: str = "",
                 bold: bool = False,
                 underline: bool = False,
                 italic: bool = False,
                 alignment: int = -1,
                 alpha: float = 255,
                 rich_text: str = "",
                 _bounding_rect: List = None,
                 accumulate_color = True,
                 default_stroke_width = 0.2,
                 target_lang: str = "",
                 **kwargs) -> None:
        """Store the block geometry, layout statistics and font attributes.

        Unknown keyword arguments are accepted and ignored.
        """
        self.xyxy = [int(num) for num in xyxy]          # bounding box of the text block
        self.lines = [] if lines is None else lines     # polygons of text lines
        self.vertical = vertical                        # orientation of text lines
        self.language = language
        self.font_size = font_size                      # font pixel size
        # distance between text lines and the "origin"
        self.distance = None if distance is None else np.array(distance, np.float64)
        self.angle = angle                              # rotation angle of text lines
        # primary vector of the text block
        self.vec = None if vec is None else np.array(vec, np.float64)
        self.norm = norm                                # primary norm of the text block
        self.merged = merged
        self.weight = weight
        self.text = text if text is not None else []
        self.prob = 1

        self.translation = translation

        # note: these are accumulative rgb values of the text lines
        self.fg_r = fg_r
        self.fg_g = fg_g
        self.fg_b = fg_b
        self.bg_r = bg_r
        self.bg_g = bg_g
        self.bg_b = bg_b

        self.font_family: str = font_family
        self.bold: bool = bold
        self.underline: bool = underline
        self.italic: bool = italic
        self.alpha = alpha
        self.rich_text = rich_text
        self.line_spacing = line_spacing
        self._alignment = alignment
        self._target_lang = target_lang

        self._bounding_rect = _bounding_rect
        self.default_stroke_width = default_stroke_width
        self.accumulate_color = accumulate_color

    def adjust_bbox(self, with_bbox=False):
        """Refit ``xyxy`` to the extent of the text lines.

        With ``with_bbox=True`` the current box is only ever enlarged (the
        result is the union of the old box and the lines' extent);
        otherwise the box is replaced by the lines' extent.
        """
        lines = self.lines_array().astype(np.int32)
        # Convert to plain ints so xyxy keeps the type __init__ gives it.
        x1, y1 = int(lines[..., 0].min()), int(lines[..., 1].min())
        x2, y2 = int(lines[..., 0].max()), int(lines[..., 1].max())
        if with_bbox:
            x1 = int(min(x1, self.xyxy[0]))
            y1 = int(min(y1, self.xyxy[1]))
            x2 = int(max(x2, self.xyxy[2]))
            y2 = int(max(y2, self.xyxy[3]))
        # Assign by index so the existing xyxy container is updated in place.
        self.xyxy[0] = x1
        self.xyxy[1] = y1
        self.xyxy[2] = x2
        self.xyxy[3] = y2

    def sort_lines(self):
        """Reorder ``lines`` and ``distance`` by ascending ``distance``.

        Does nothing when ``distance`` is None.
        """
        if self.distance is not None:
            idx = np.argsort(self.distance)
            self.distance = self.distance[idx]
            lines = np.array(self.lines, dtype=np.int32)
            self.lines = lines[idx].tolist()

    def lines_array(self, dtype=np.float64):
        """Return ``lines`` as a numpy array of the requested dtype."""
        return np.array(self.lines, dtype=dtype)

    def aspect_ratio(self) -> float:
        """Return the ratio of the two side lengths of ``min_rect()``.

        The numerator is the distance between the midpoints of edges 0 and
        2 of the rectangle, the denominator that of edges 1 and 3.
        """
        min_rect = self.min_rect()
        middle_pnts = (min_rect[:, [1, 2, 3, 0]] + min_rect) / 2
        norm_v = np.linalg.norm(middle_pnts[:, 2] - middle_pnts[:, 0])
        norm_h = np.linalg.norm(middle_pnts[:, 1] - middle_pnts[:, 3])
        return norm_v / norm_h

    def center(self):
        """Return the centre ``(x, y)`` of ``xyxy`` as a numpy array."""
        xyxy = np.array(self.xyxy)
        return (xyxy[:2] + xyxy[2:]) / 2

    def min_rect(self, rotate_back=True):
        """Return the rectangle enclosing all lines, shape ``(1, 4, 2)``, int64.

        When ``angle`` is non-zero the lines are first passed through
        ``rotate_polygons`` about the block centre with ``angle``, the
        axis-aligned extent is taken, and, if ``rotate_back`` is true, the
        rectangle is passed through ``rotate_polygons`` again with
        ``-angle``.
        """
        angled = self.angle != 0
        center = self.center()
        polygons = self.lines_array().reshape(-1, 8)
        if angled:
            polygons = rotate_polygons(center, polygons, self.angle)
        # Even columns hold x coordinates, odd columns hold y coordinates.
        min_x = polygons[:, ::2].min()
        min_y = polygons[:, 1::2].min()
        max_x = polygons[:, ::2].max()
        max_y = polygons[:, 1::2].max()
        min_bbox = np.array([[min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y]])
        if angled and rotate_back:
            min_bbox = rotate_polygons(center, min_bbox, -self.angle)
        return min_bbox.reshape(-1, 4, 2).astype(np.int64)

    # equivalent to qt's boundingRect, ignore angle
    def bounding_rect(self):
        """Return ``[x, y, w, h]``.

        Uses the stored ``_bounding_rect`` when one was provided, otherwise
        derives it from ``min_rect(rotate_back=False)``.
        """
        if self._bounding_rect is None:
            min_bbox = self.min_rect(rotate_back=False)[0]
            x, y = min_bbox[0]
            w, h = min_bbox[2] - min_bbox[0]
            return [x, y, w, h]
        return self._bounding_rect

    def __getattribute__(self, name: str):
        """Resolve attributes normally, except ``pts``.

        ``pts`` is computed on every access as ``lines_array()``.
        """
        if name == 'pts':
            return self.lines_array()
        return object.__getattribute__(self, name)

    def __len__(self):
        """Return the number of text lines."""
        return len(self.lines)

    def __getitem__(self, idx):
        """Return the text line at ``idx``."""
        return self.lines[idx]

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dict."""
        blk_dict = copy.deepcopy(vars(self))
        return blk_dict

    def get_transformed_region(self, img, idx, textheight) -> np.ndarray:
        """Warp text line ``idx`` out of ``img`` into an upright strip.

        For horizontal blocks the strip is ``textheight`` pixels high; for
        vertical blocks it is ``textheight`` pixels wide and is then rotated
        90 degrees counter-clockwise. For 'eng' blocks, and for non-vertical
        'unknown' blocks, the source polygon is first expanded by
        ``font_size / 3`` and clipped to the image.
        """
        im_h, im_w = img.shape[:2]
        src_pts = np.array(self.lines[idx], dtype=np.float64)

        if self.language == 'eng' or (self.language == 'unknown' and not self.vertical):
            e_size = self.font_size / 3
            src_pts[..., 0] += np.array([-e_size, e_size, e_size, -e_size])
            src_pts[..., 1] += np.array([-e_size, -e_size, e_size, e_size])
            src_pts[..., 0] = np.clip(src_pts[..., 0], 0, im_w)
            src_pts[..., 1] = np.clip(src_pts[..., 1], 0, im_h)

        middle_pnt = (src_pts[[1, 2, 3, 0]] + src_pts) / 2
        vec_v = middle_pnt[2] - middle_pnt[0]   # vertical vector of the text line
        vec_h = middle_pnt[1] - middle_pnt[3]   # horizontal vector of the text line
        ratio = np.linalg.norm(vec_v) / np.linalg.norm(vec_h)

        # Only the target size depends on orientation; the warp is shared.
        if self.vertical:
            w = int(textheight)
            h = int(round(textheight * ratio))
        else:
            h = int(textheight)
            w = int(round(textheight / ratio))
        dst_pts = np.array([[0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1]]).astype(np.float32)
        M, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
        region = cv2.warpPerspective(img, M, (w, h))
        if self.vertical:
            region = cv2.rotate(region, cv2.ROTATE_90_COUNTERCLOCKWISE)
        return region

    def get_text(self):
        """Return ``text`` as one string (list entries joined by spaces)."""
        if isinstance(self.text, str):
            return self.text
        return ' '.join(self.text).strip()

    def set_font_colors(self, frgb, srgb, accumulate=True):
        """Store the font colour ``frgb`` and stroke colour ``srgb``.

        When ``accumulate`` is true and the block has lines, each channel is
        stored multiplied by the number of lines, matching what
        ``get_font_colors`` divides by.
        """
        self.accumulate_color = accumulate
        num_lines = len(self.lines) if accumulate and len(self.lines) > 0 else 1
        # set font color
        frgb = np.array(frgb) * num_lines
        self.fg_r, self.fg_g, self.fg_b = frgb
        # set stroke color
        srgb = np.array(srgb) * num_lines
        self.bg_r, self.bg_g, self.bg_b = srgb

    def get_font_colors(self, bgr=False):
        """Return ``(foreground, background)`` colours.

        With ``accumulate_color`` set, the stored sums are divided by the
        number of lines and cast to int32; a block without lines yields
        ``[0, 0, 0], [0, 0, 0]``. With ``bgr=True`` the channel order is
        reversed in every case, including when ``accumulate_color`` is off.
        """
        num_lines = len(self.lines)
        frgb = np.array([self.fg_r, self.fg_g, self.fg_b])
        brgb = np.array([self.bg_r, self.bg_g, self.bg_b])
        if self.accumulate_color:
            if num_lines == 0:
                return [0, 0, 0], [0, 0, 0]
            frgb = (frgb / num_lines).astype(np.int32)
            brgb = (brgb / num_lines).astype(np.int32)
        if bgr:
            return frgb[::-1], brgb[::-1]
        return frgb, brgb

    def xywh(self):
        """Return the bounding box as ``[x, y, width, height]``."""
        x1, y1, x2, y2 = self.xyxy
        return [x1, y1, x2 - x1, y2 - y1]

    # alignleft: 0, center: 1, right: 2
    def alignment(self):
        """Return the alignment code of the block.

        An explicit non-negative ``_alignment`` wins. Vertical and
        single-line blocks report 0. Otherwise the spread of the lines'
        first-point x coordinates is compared with the spread of the
        midpoints between their first and second points: 0 if the former is
        smaller, else 1.
        """
        if self._alignment >= 0:
            return self._alignment
        elif self.vertical:
            return 0
        lines = self.lines_array()
        if len(lines) == 1:
            return 0
        angled = self.angle != 0
        polygons = lines.reshape(-1, 8)
        if angled:
            polygons = rotate_polygons((0, 0), polygons, self.angle)
        polygons = polygons.reshape(-1, 4, 2)

        left_std = np.std(polygons[:, 0, 0])
        center_std = np.std((polygons[:, 0, 0] + polygons[:, 1, 0]) / 2)
        if left_std < center_std:
            return 0
        else:
            return 1

    def target_lang(self):
        """Return the target language given at construction."""
        # The value lives in ``_target_lang``; ``self.target_lang`` would be
        # this bound method itself.
        return self._target_lang

    @property
    def stroke_width(self):
        """Return ``default_stroke_width`` when the colours differ enough.

        The summed absolute channel difference between the stored foreground
        and background values must exceed 40; otherwise 0 is returned.
        """
        var = np.array([self.fg_r, self.fg_g, self.fg_b]) \
            - np.array([self.bg_r, self.bg_g, self.bg_b])
        var = np.abs(var).sum()
        if var > 40:
            return self.default_stroke_width
        return 0


def sort_textblk_list(blk_list: List[TextBlock], im_w: int, im_h: int) -> List[TextBlock]:
    """Sort blocks in place by a grid-based weight and return the list.

    Block centres are binned into a 3 x 4 (x by y) grid. When the image is
    wider than tall the grid is laid over half the width, and blocks that
    land beyond the last grid column are pushed after all the others. When
    more than half of the blocks are 'ja', x is mirrored first. Each
    block's ``weight`` is set as a side effect.
    """
    if len(blk_list) == 0:
        return blk_list
    num_ja = sum(1 for blk in blk_list if blk.language == 'ja')
    xyxy = np.array([blk.xyxy for blk in blk_list])
    flip_lr = num_ja > len(blk_list) / 2
    im_oriw = im_w
    if im_w > im_h:
        im_w /= 2
    num_gridy, num_gridx = 4, 3
    img_area = im_h * im_w
    center_x = (xyxy[:, 0] + xyxy[:, 2]) / 2
    if flip_lr:
        # Mirror about the original width; when the width was not halved
        # im_w equals im_oriw, so one expression covers both cases.
        center_x = im_oriw - center_x
    grid_x = (center_x / im_w * num_gridx).astype(np.int32)
    center_y = (xyxy[:, 1] + xyxy[:, 3]) / 2
    grid_y = (center_y / im_h * num_gridy).astype(np.int32)
    grid_indices = grid_y * num_gridx + grid_x
    grid_weights = grid_indices * img_area \
        + 1.2 * (center_x - grid_x * im_w / num_gridx) \
        + (center_y - grid_y * im_h / num_gridy)
    if im_w != im_oriw:
        # Blocks beyond the last grid column sort after every other block.
        grid_weights[grid_x >= num_gridx] += img_area * num_gridy * num_gridx

    for blk, weight in zip(blk_list, grid_weights):
        blk.weight = weight
    blk_list.sort(key=lambda blk: blk.weight)
    return blk_list


def examine_textblk(blk: TextBlock, im_w: int, im_h: int, sort: bool = False) -> None:
    """Derive orientation, angle, font size and line distances for ``blk``.

    Updates ``lines`` (cast to int32 lists), ``distance``, ``angle``,
    ``font_size``, ``vertical``, ``vec`` and ``norm`` on the block, and
    optionally sorts the lines by distance.
    """
    lines = blk.lines_array()
    middle_pnts = (lines[:, [1, 2, 3, 0]] + lines) / 2
    vec_v = middle_pnts[:, 2] - middle_pnts[:, 0]   # vertical vectors of textlines
    vec_h = middle_pnts[:, 1] - middle_pnts[:, 3]   # horizontal vectors of textlines
    # if sum of vertical vectors is longer, then text orientation is vertical, and vice versa.
    center_pnts = (lines[:, 0] + lines[:, 2]) / 2
    v = np.sum(vec_v, axis=0)
    h = np.sum(vec_h, axis=0)
    norm_v, norm_h = np.linalg.norm(v), np.linalg.norm(h)
    if blk.language == 'ja':
        vertical = norm_v > norm_h
    else:
        vertical = norm_v > norm_h * 2
    # calculate distance between textlines and origin
    if vertical:
        primary_vec, primary_norm = v, norm_v
        # vertical manga text is read from right to left, so origin is (imw, 0)
        distance_vectors = center_pnts - np.array([[im_w, 0]], dtype=np.float64)
        font_size = int(round(norm_h / len(lines)))
    else:
        primary_vec, primary_norm = h, norm_h
        distance_vectors = center_pnts - np.array([[0, 0]], dtype=np.float64)
        font_size = int(round(norm_v / len(lines)))

    # rotation angle of textlines
    rotation_angle = int(math.atan2(primary_vec[1], primary_vec[0]) / math.pi * 180)
    # distance between textline centers and origin
    distance = np.linalg.norm(distance_vectors, axis=1)
    cos_theta = np.einsum('ij, j->i', distance_vectors, primary_vec) / (distance * primary_norm)
    # Rounding can push the cosine just past +/-1, which makes arccos NaN.
    rad_matrix = np.arccos(np.clip(cos_theta, -1.0, 1.0))
    distance = np.abs(np.sin(rad_matrix) * distance)
    blk.lines = lines.astype(np.int32).tolist()
    blk.distance = distance
    blk.angle = rotation_angle
    if vertical:
        blk.angle -= 90
    if abs(blk.angle) < 3:
        blk.angle = 0
    blk.font_size = font_size
    blk.vertical = vertical
    blk.vec = primary_vec
    blk.norm = primary_norm
    if sort:
        blk.sort_lines()


def try_merge_textline(blk: TextBlock, blk2: TextBlock, fntsize_tol=1.3, distance_tol=2) -> bool:
    """Append ``blk2``'s first line to ``blk`` if the two are compatible.

    If the last lines of the two blocks do not intersect, the merge is
    refused when the font-size ratio exceeds ``fntsize_tol`` in either
    direction, when the primary vectors differ by more than 30 degrees, or
    when the lines are too far apart. Intersecting lines are always merged.
    On success ``blk`` is updated, ``blk2.merged`` is set and True is
    returned.
    """
    if blk2.merged:
        return False
    num_l1, num_l2 = len(blk), len(blk2)
    fntsz_avg = (blk.font_size * num_l1 + blk2.font_size * num_l2) / (num_l1 + num_l2)
    vec_prod = blk.vec @ blk2.vec
    vec_sum = blk.vec + blk2.vec
    cos_vec = vec_prod / blk.norm / blk2.norm
    distance = blk2.distance[-1] - blk.distance[-1]
    distance_p1 = np.linalg.norm(np.array(blk2.lines[-1][0]) - np.array(blk.lines[-1][0]))
    l1, l2 = Polygon(blk.lines[-1]), Polygon(blk2.lines[-1])
    if not l1.intersects(l2):
        # A non-positive font size cannot form a meaningful ratio; treat it
        # as a mismatch rather than dividing by zero.
        if blk.font_size <= 0 or blk2.font_size <= 0:
            return False
        fntsize_div = blk.font_size / blk2.font_size
        if fntsize_div > fntsize_tol or 1 / fntsize_div > fntsize_tol:
            return False
        if abs(cos_vec) < 0.866:   # cos30
            return False
        if distance > distance_tol * fntsz_avg or distance_p1 > fntsz_avg * 2.5:
            return False
    # merge
    blk.lines.append(blk2.lines[0])
    blk.vec = vec_sum
    blk.angle = int(round(np.rad2deg(math.atan2(vec_sum[1], vec_sum[0]))))
    if blk.vertical:
        blk.angle -= 90
    blk.norm = np.linalg.norm(vec_sum)
    blk.distance = np.append(blk.distance, blk2.distance[-1])
    blk.font_size = fntsz_avg
    blk2.merged = True
    return True


def merge_textlines(blk_list: List[TextBlock]) -> List[TextBlock]:
    """Greedily merge blocks and return the surviving ones.

    The list is sorted in place by each block's first distance. Every
    not-yet-merged block tries to absorb each later block. Survivors get
    their bounding box refitted to their lines. Lists shorter than two
    are returned untouched.
    """
    if len(blk_list) < 2:
        return blk_list
    blk_list.sort(key=lambda blk: blk.distance[0])
    merged_list = []
    for ii, current_blk in enumerate(blk_list):
        if current_blk.merged:
            continue
        for candidate in blk_list[ii + 1:]:
            try_merge_textline(current_blk, candidate)
        merged_list.append(current_blk)
    for blk in merged_list:
        blk.adjust_bbox(with_bbox=False)
    return merged_list


def split_textblk(blk: TextBlock):
    """Split ``blk`` into sub-blocks wherever consecutive lines are far apart.

    Returns ``(textblock_splitted, sub_blk_list)``. ``blk.lines`` is
    re-sorted in place by the distance of each line's first point from the
    first line's first point. Bounding boxes of the sub-blocks are refitted
    only when a split actually happened.
    """
    font_size, distance, lines = blk.font_size, blk.distance, blk.lines
    l0 = np.array(blk.lines[0])
    # NOTE(review): `lines` is reordered here but `distance` is not, so the
    # distance[jj] lookups below assume this sort leaves the order
    # unchanged - confirm against callers.
    lines.sort(key=lambda line: np.linalg.norm(np.array(line[0]) - l0[0]))
    distance_tol = font_size * 2
    current_blk = copy.deepcopy(blk)
    current_blk.lines = [l0]
    sub_blk_list = [current_blk]
    textblock_splitted = False
    for jj, line in enumerate(lines[1:]):
        # lines[jj] is the line preceding `line`.
        l1, l2 = Polygon(lines[jj]), Polygon(line)
        split = False
        if not l1.intersects(l2):
            line_disance = abs(distance[jj + 1] - distance[jj])
            if line_disance > distance_tol:
                split = True
            elif blk.vertical and abs(blk.angle) < 15:
                if len(current_blk.lines) > 1 or line_disance > font_size:
                    split = abs(lines[jj][0][1] - line[0][1]) > font_size
        if split:
            current_blk = copy.deepcopy(current_blk)
            current_blk.lines = [line]
            sub_blk_list.append(current_blk)
        else:
            current_blk.lines.append(line)
    if len(sub_blk_list) > 1:
        textblock_splitted = True
        for current_blk in sub_blk_list:
            current_blk.adjust_bbox(with_bbox=False)
    return textblock_splitted, sub_blk_list


def group_output(blks, lines, im_w, im_h, mask=None, sort_blklist=True) -> List[TextBlock]:
    """Group detected text lines into text blocks.

    ``blks`` is unpacked with ``zip(*blks)`` into ``(bbox, cls, conf)``
    triples, where ``cls`` indexes ``LANG_LIST``. Each element of ``lines``
    is indexed as a 2-D array of points. ``mask``, when given, is indexed
    as ``mask[y1:y2, x1:x2]`` and compared against 255.

    Steps: assign each line to its best-scoring block (or keep it as a
    scattered line), examine/split the blocks, merge scattered lines, sort
    the result, and finally enlarge the lines of horizontal 'eng' blocks.
    """
    blk_list: List[TextBlock] = []
    scattered_lines = {'ver': [], 'hor': []}
    for bbox, cls, _conf in zip(*blks):
        # cls could give wrong result
        blk_list.append(TextBlock(bbox, language=LANG_LIST[cls]))

    # step1: filter & assign lines to textblocks
    bbox_score_thresh = 0.4
    mask_score_thresh = 0.1
    for line in lines:
        bx1, bx2 = line[:, 0].min(), line[:, 0].max()
        by1, by2 = line[:, 1].min(), line[:, 1].max()
        bbox_score, bbox_idx = -1, -1
        line_area = (by2 - by1) * (bx2 - bx1)
        for jj, blk in enumerate(blk_list):
            score = union_area(blk.xyxy, [bx1, by1, bx2, by2]) / line_area
            if bbox_score < score:
                bbox_score = score
                bbox_idx = jj
        if bbox_score > bbox_score_thresh:
            blk_list[bbox_idx].lines.append(line)
        else:
            # if no textblock was assigned, check whether there is "enough" textmask
            if mask is not None:
                mask_score = mask[by1: by2, bx1: bx2].mean() / 255
                if mask_score < mask_score_thresh:
                    continue
            blk = TextBlock([bx1, by1, bx2, by2], [line])
            examine_textblk(blk, im_w, im_h, sort=False)
            if blk.vertical:
                scattered_lines['ver'].append(blk)
            else:
                scattered_lines['hor'].append(blk)

    # step2: filter textblocks, sort & split textlines
    final_blk_list = []
    for blk in blk_list:
        # filter textblocks
        if len(blk.lines) == 0:
            bx1, by1, bx2, by2 = blk.xyxy
            if mask is not None:
                mask_score = mask[by1: by2, bx1: bx2].mean() / 255
                if mask_score < mask_score_thresh:
                    continue
            # A block without lines gets its own box as its single line.
            xywh = np.array([[bx1, by1, bx2 - bx1, by2 - by1]])
            blk.lines = xywh2xyxypoly(xywh).reshape(-1, 4, 2).tolist()
        examine_textblk(blk, im_w, im_h, sort=True)

        # split manga text if there is a distance gap
        textblock_splitted = False
        if len(blk.lines) > 1 and (blk.language == 'ja' or blk.vertical):
            textblock_splitted, sub_blk_list = split_textblk(blk)
        else:
            sub_blk_list = [blk]
        # modify textblock to fit its textlines
        if not textblock_splitted:
            for sub_blk in sub_blk_list:
                sub_blk.adjust_bbox(with_bbox=True)
        final_blk_list += sub_blk_list

    # step3: merge scattered lines, sort textblocks by "grid"
    final_blk_list += merge_textlines(scattered_lines['hor'])
    final_blk_list += merge_textlines(scattered_lines['ver'])
    if sort_blklist:
        final_blk_list = sort_textblk_list(final_blk_list, im_w, im_h)

    for blk in final_blk_list:
        if blk.language == 'eng' and not blk.vertical:
            num_lines = len(blk.lines)
            if num_lines == 0:
                continue
            # Shift each polygon corner outwards and clip to the image.
            expand_size = max(int(blk.font_size * 0.1), 2)
            rad = np.deg2rad(blk.angle)
            shifted_vec = np.array([[[-1, -1], [1, -1], [1, 1], [-1, 1]]])
            shifted_vec = shifted_vec * np.array([[[np.sin(rad), np.cos(rad)]]]) * expand_size
            lines = blk.lines_array() + shifted_vec
            lines[..., 0] = np.clip(lines[..., 0], 0, im_w - 1)
            lines[..., 1] = np.clip(lines[..., 1], 0, im_h - 1)
            blk.lines = lines.astype(np.int64).tolist()
            blk.font_size += expand_size

    return final_blk_list


def visualize_textblocks(canvas, blk_list: List[TextBlock], path = '../output/'):
    """Draw the blocks onto ``canvas`` and write one crop per block.

    For each block this draws its bounding box, its line polygons with
    their indices, its ``min_rect()``, its angle and its index. The
    bounding-box crop of each block is written to
    ``path + '/cut_image_<index>.png'``. Returns ``canvas``, which is
    modified in place.
    """
    lw = max(round(sum(canvas.shape) / 2 * 0.003), 2)  # line width
    for ii, blk in enumerate(blk_list):
        bx1, by1, bx2, by2 = blk.xyxy
        cv2.rectangle(canvas, (bx1, by1), (bx2, by2), (127, 255, 127), lw)
        cut_img = canvas[by1:by2, bx1:bx2]
        cv2.imwrite(path + f'/cut_image_{ii}.png', cut_img)
        lines = blk.lines_array(dtype=np.int32)
        for jj, line in enumerate(lines):
            cv2.putText(canvas, str(jj), line[0], cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 127, 0), 1)
            cv2.polylines(canvas, [line], True, (0, 127, 255), 2)
        cv2.polylines(canvas, [blk.min_rect()], True, (127, 127, 0), 2)
        center = [int((bx1 + bx2) / 2), int((by1 + by2) / 2)]
        cv2.putText(canvas, str(blk.angle), center, cv2.FONT_HERSHEY_SIMPLEX, 1, (127, 127, 255), 2)
        cv2.putText(canvas, str(ii), (bx1, by1 + lw + 2), 0, lw / 3, (255, 127, 127), max(lw - 1, 1), cv2.LINE_AA)
    return canvas