# Copyright (C) 2021-2025, Mindee.

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to for full license details.

from copy import deepcopy
from math import ceil

import cv2
import numpy as np

from .common_types import BoundingBox, Polygon4P

__all__ = [
    "bbox_to_polygon",
    "polygon_to_bbox",
    "resolve_enclosing_bbox",
    "resolve_enclosing_rbbox",
    "rotate_boxes",
    "compute_expanded_shape",
    "rotate_image",
    "remove_image_padding",
    "estimate_page_angle",
    "convert_to_relative_coords",
    "rotate_abs_geoms",
    "extract_crops",
    "extract_rcrops",
    "detach_scores",
]


def bbox_to_polygon(bbox: BoundingBox) -> Polygon4P:
    """Convert a bounding box to a polygon

    The corners are emitted in the order
    (xmin, ymin), (xmax, ymin), (xmin, ymax), (xmax, ymax).

    Args:
        bbox: a bounding box given as ((xmin, ymin), (xmax, ymax))

    Returns:
        a polygon made of the four corners of the box
    """
    return bbox[0], (bbox[1][0], bbox[0][1]), (bbox[0][0], bbox[1][1]), bbox[1]


def polygon_to_bbox(polygon: Polygon4P) -> BoundingBox:
    """Convert a polygon to a bounding box

    Args:
        polygon: a polygon given as a sequence of (x, y) points

    Returns:
        the axis-aligned bounding box ((xmin, ymin), (xmax, ymax)) enclosing all the points
    """
    x, y = zip(*polygon)
    return (min(x), min(y)), (max(x), max(y))


def detach_scores(boxes: list[np.ndarray]) -> tuple[list[np.ndarray], list[np.ndarray]]:
    """Detach the objectness scores from box predictions

    Args:
        boxes: list of arrays with boxes of shape (N, 5) or (N, 5, 2)

    Returns:
        a tuple of two lists: the first one contains the boxes without the objectness scores,
        the second one contains the objectness scores
    """

    def _detach(boxes: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        # Straight boxes: the score is the last column
        if boxes.ndim == 2:
            return boxes[:, :-1], boxes[:, -1]
        # Polygons: the score is read from the last coordinate of the last point
        return boxes[:, :-1], boxes[:, -1, -1]

    # `zip(*...)` of nothing cannot be unpacked into two names: handle the empty batch explicitly
    if len(boxes) == 0:
        return [], []

    loc_preds, obj_scores = zip(*(_detach(box) for box in boxes))
    return list(loc_preds), list(obj_scores)


def resolve_enclosing_bbox(bboxes: list[BoundingBox] | np.ndarray) -> BoundingBox | np.ndarray:
    """Compute the straight box enclosing a set of straight boxes

    Args:
        bboxes: boxes in one of the following formats:

            - an array of boxes: (*, 4), where boxes have this shape:
              (xmin, ymin, xmax, ymax)

            - a list of BoundingBox

    Returns:
        a (4,) array (xmin, ymin, xmax, ymax) when an array was given, otherwise a BoundingBox
    """
    if isinstance(bboxes, np.ndarray):
        xmin, ymin, xmax, ymax = np.split(bboxes, 4, axis=1)
        return np.array([xmin.min(), ymin.min(), xmax.max(), ymax.max()])
    else:
        x, y = zip(*[point for box in bboxes for point in box])
        return (min(x), min(y)), (max(x), max(y))


def resolve_enclosing_rbbox(rbboxes: list[np.ndarray], intermed_size: int = 1024) -> np.ndarray:
    """Compute the rotated box enclosing a set of rotated boxes

    Args:
        rbboxes: list of arrays of points; each array is concatenated along its first axis, e.g. boxes of
            shape (4, 2): (x1, y1), (x2, y2), (x3, y3), (x4, y4)
        intermed_size: scaling factor applied to the coordinates before they are cast to integers
            for the minimum-area rectangle computation

    Returns:
        a (4, 2) array (enclosing rotated box), scaled back by `intermed_size`
    """
    # `np.concatenate` allocates a new array, so the in-place scaling below does not touch the inputs
    cloud: np.ndarray = np.concatenate(rbboxes, axis=0)
    # Convert to absolute for minAreaRect
    cloud *= intermed_size
    rect = cv2.minAreaRect(cloud.astype(np.int32))
    return cv2.boxPoints(rect) / intermed_size


def rotate_abs_points(points: np.ndarray, angle: float = 0.0) -> np.ndarray:
    """Rotate points counter-clockwise.

    Args:
        points: array of size (N, 2)
        angle: angle between -90 and +90 degrees

    Returns:
        Rotated points
    """
    angle_rad = angle * np.pi / 180.0  # compute radian angle for np functions
    rotation_mat = np.array(
        [[np.cos(angle_rad), -np.sin(angle_rad)], [np.sin(angle_rad), np.cos(angle_rad)]], dtype=points.dtype
    )
    return np.matmul(points, rotation_mat.T)


def compute_expanded_shape(img_shape: tuple[int, int], angle: float) -> tuple[int, int]:
    """Compute the shape of an expanded rotated image

    Args:
        img_shape: the height and width of the image
        angle: angle between -90 and +90 degrees

    Returns:
        the height and width of the rotated image
    """
    # Two adjacent corners, expressed relatively to the image center
    points: np.ndarray = np.array([
        [img_shape[1] / 2, img_shape[0] / 2],
        [-img_shape[1] / 2, img_shape[0] / 2],
    ])

    rotated_points = rotate_abs_points(points, angle)

    # The extent on each axis is twice the largest absolute coordinate of the rotated corners
    wh_shape = 2 * np.abs(rotated_points).max(axis=0)
    return wh_shape[1], wh_shape[0]


def rotate_abs_geoms(
    geoms: np.ndarray,
    angle: float,
    img_shape: tuple[int, int],
    expand: bool = True,
) -> np.ndarray:
    """Rotate a batch of bounding boxes or polygons by an angle around the
    image center.

    Args:
        geoms: (N, 4) or (N, 4, 2) array of ABSOLUTE coordinate boxes
        angle: anti-clockwise rotation angle in degrees
        img_shape: the height and width of the image
        expand: whether the image should be padded to avoid information loss

    Returns:
        A batch of rotated polygons (N, 4, 2)
    """
    # Switch to polygons
    polys = (
        np.stack([geoms[:, [0, 1]], geoms[:, [2, 1]], geoms[:, [2, 3]], geoms[:, [0, 3]]], axis=1)
        if geoms.ndim == 2
        else geoms
    )
    polys = polys.astype(np.float32)

    # Switch to image center as referential (the Y axis is flipped to point upwards)
    polys[..., 0] -= img_shape[1] / 2
    polys[..., 1] = img_shape[0] / 2 - polys[..., 1]

    # Rotated them around image center
    rotated_polys = rotate_abs_points(polys.reshape(-1, 2), angle).reshape(-1, 4, 2)
    # Switch back to top-left corner as referential
    target_shape = compute_expanded_shape(img_shape, angle) if expand else img_shape
    # Clip coords to fit since there is no expansion
    rotated_polys[..., 0] = (rotated_polys[..., 0] + target_shape[1] / 2).clip(0, target_shape[1])
    rotated_polys[..., 1] = (target_shape[0] / 2 - rotated_polys[..., 1]).clip(0, target_shape[0])

    return rotated_polys


def remap_boxes(loc_preds: np.ndarray, orig_shape: tuple[int, int], dest_shape: tuple[int, int]) -> np.ndarray:
    """Remaps a batch of rotated locpred (N, 4, 2) expressed for an origin_shape to a destination_shape.
    This does not impact the absolute shape of the boxes, but allow to calculate the new relative RotatedBbox
    coordinates after a resizing of the image.

    Args:
        loc_preds: (N, 4, 2) array of RELATIVE loc_preds
        orig_shape: shape of the origin image
        dest_shape: shape of the destination image

    Returns:
        A batch of rotated loc_preds (N, 4, 2) expressed in the destination referencial

    Raises:
        ValueError: if `dest_shape` or `orig_shape` does not have exactly 2 elements
    """
    if len(dest_shape) != 2:
        raise ValueError(f"Mask length should be 2, was found at: {len(dest_shape)}")
    if len(orig_shape) != 2:
        raise ValueError(f"Image_shape length should be 2, was found at: {len(orig_shape)}")
    orig_height, orig_width = orig_shape
    dest_height, dest_width = dest_shape
    mboxes = loc_preds.copy()
    # Back to absolute coordinates, shift by half of the size difference, then normalize by the new size
    mboxes[:, :, 0] = ((loc_preds[:, :, 0] * orig_width) + (dest_width - orig_width) / 2) / dest_width
    mboxes[:, :, 1] = ((loc_preds[:, :, 1] * orig_height) + (dest_height - orig_height) / 2) / dest_height

    return mboxes


def rotate_boxes(
    loc_preds: np.ndarray,
    angle: float,
    orig_shape: tuple[int, int],
    min_angle: float = 1.0,
    target_shape: tuple[int, int] | None = None,
) -> np.ndarray:
    """Rotate a batch of straight bounding boxes (xmin, ymin, xmax, ymax, c) or rotated bounding boxes
    (4, 2) of an angle, if angle > min_angle, around the center of the page.
    If target_shape is specified, the boxes are remapped to the target shape after the rotation. This
    is done to remove the padding that is created by rotate_page(expand=True)

    Args:
        loc_preds: (N, 4) or (N, 4, 2) array of RELATIVE boxes
        angle: angle between -90 and +90 degrees
        orig_shape: shape of the origin image
        min_angle: minimum angle to rotate boxes
        target_shape: shape of the destination image

    Returns:
        A batch of boxes in polygon format (N, 4, 2): rotated, or left as they were when the angle is
        within `min_angle` of 0 or of +/-90 degrees
    """
    # Change format of the boxes to rotated boxes
    _boxes = loc_preds.copy()
    if _boxes.ndim == 2:
        _boxes = np.stack(
            [
                _boxes[:, [0, 1]],
                _boxes[:, [2, 1]],
                _boxes[:, [2, 3]],
                _boxes[:, [0, 3]],
            ],
            axis=1,
        )
    # If small angle, return boxes (no rotation)
    if abs(angle) < min_angle or abs(angle) > 90 - min_angle:
        return _boxes
    # Compute rotation matrix
    angle_rad = angle * np.pi / 180.0  # compute radian angle for np functions
    rotation_mat = np.array(
        [[np.cos(angle_rad), -np.sin(angle_rad)], [np.sin(angle_rad), np.cos(angle_rad)]], dtype=_boxes.dtype
    )
    # Rotate absolute points
    points: np.ndarray = np.stack((_boxes[:, :, 0] * orig_shape[1], _boxes[:, :, 1] * orig_shape[0]), axis=-1)
    image_center = (orig_shape[1] / 2, orig_shape[0] / 2)
    rotated_points = image_center + np.matmul(points - image_center, rotation_mat)
    # Back to relative coordinates
    rotated_boxes: np.ndarray = np.stack(
        (rotated_points[:, :, 0] / orig_shape[1], rotated_points[:, :, 1] / orig_shape[0]), axis=-1
    )

    # Apply a mask if requested
    if target_shape is not None:
        rotated_boxes = remap_boxes(rotated_boxes, orig_shape=orig_shape, dest_shape=target_shape)

    return rotated_boxes


def rotate_image(
    image: np.ndarray,
    angle: float,
    expand: bool = False,
    preserve_origin_shape: bool = False,
) -> np.ndarray:
    """Rotate an image counterclockwise by an given angle.

    Args:
        image: numpy tensor to rotate, of shape (H, W, C) or (H, W)
        angle: rotation angle in degrees, between -90 and +90
        expand: whether the image should be padded before the rotation
        preserve_origin_shape: if expand is set to True, resizes the final output
            to the original image size

    Returns:
        Rotated array, padded by 0 by default.
    """
    # Compute the expanded padding
    exp_img: np.ndarray
    if expand:
        exp_shape = compute_expanded_shape(image.shape[:2], angle)
        h_pad, w_pad = (
            int(max(0, ceil(exp_shape[0] - image.shape[0]))),
            int(max(0, ceil(exp_shape[1] - image.shape[1]))),
        )
        # Only the two spatial axes are padded; any trailing (channel) axis is left untouched
        channel_pad = ((0, 0),) * (image.ndim - 2)
        exp_img = np.pad(image, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), *channel_pad))
    else:
        exp_img = image

    height, width = exp_img.shape[:2]
    rot_mat = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1.0)
    rot_img = cv2.warpAffine(exp_img, rot_mat, (width, height))
    if expand:
        # Pad to get the same aspect ratio
        if (image.shape[0] / image.shape[1]) != (rot_img.shape[0] / rot_img.shape[1]):
            # Pad width
            if (rot_img.shape[0] / rot_img.shape[1]) > (image.shape[0] / image.shape[1]):
                h_pad, w_pad = 0, int(rot_img.shape[0] * image.shape[1] / image.shape[0] - rot_img.shape[1])
            # Pad height
            else:
                h_pad, w_pad = int(rot_img.shape[1] * image.shape[0] / image.shape[1] - rot_img.shape[0]), 0
            # The pad spec follows the rank of the warped image, which is the array being padded here
            channel_pad = ((0, 0),) * (rot_img.ndim - 2)
            rot_img = np.pad(
                rot_img, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), *channel_pad)
            )
        if preserve_origin_shape:
            # rescale (cv2 expects the target size as (width, height))
            rot_img = cv2.resize(rot_img, image.shape[:2][::-1], interpolation=cv2.INTER_LINEAR)

    return rot_img


def remove_image_padding(image: np.ndarray) -> np.ndarray:
    """Remove black border padding from an image

    Args:
        image: numpy tensor to remove padding from

    Returns:
        Image with padding removed. An image without any non-zero value is returned unchanged.
    """
    # Find the bounding box of the non-black region
    rows = np.any(image, axis=1)
    cols = np.any(image, axis=0)
    row_idcs = np.where(rows)[0]
    col_idcs = np.where(cols)[0]
    # Fully black image: there is no content to crop to, and indexing [0, -1] would fail
    if row_idcs.size == 0 or col_idcs.size == 0:
        return image
    rmin, rmax = row_idcs[[0, -1]]
    cmin, cmax = col_idcs[[0, -1]]

    return image[rmin : rmax + 1, cmin : cmax + 1]


def estimate_page_angle(polys: np.ndarray) -> float:
    """Takes a batch of rotated previously ORIENTED polys (N, 4, 2) (rectified by the classifier) and return the
    estimated angle ccw in degrees

    Args:
        polys: batch of oriented polygons of shape (N, 4, 2)

    Returns:
        the median angle over the batch, in degrees, or 0.0 when the computation raises a floating point
        error (e.g. a division by zero)
    """
    # Compute mean left points and mean right point with respect to the reading direction (oriented polygon)
    # (sums are used instead of means: the common factor cancels out in the ratio below)
    xleft = polys[:, 0, 0] + polys[:, 3, 0]
    yleft = polys[:, 0, 1] + polys[:, 3, 1]
    xright = polys[:, 1, 0] + polys[:, 2, 0]
    yright = polys[:, 1, 1] + polys[:, 2, 1]
    # Turn numpy floating point warnings into exceptions so that they can be handled below
    with np.errstate(divide="raise", invalid="raise"):
        try:
            return float(
                np.median(np.arctan((yleft - yright) / (xright - xleft)) * 180 / np.pi)  # Y axis from top to bottom!
            )
        except FloatingPointError:
            return 0.0


def convert_to_relative_coords(geoms: np.ndarray, img_shape: tuple[int, int]) -> np.ndarray:
    """Convert a geometry to relative coordinates

    Args:
        geoms: a set of polygons of shape (N, 4, 2) or of straight boxes of shape (N, 4)
        img_shape: the height and width of the image

    Returns:
        the updated geometry, as a float32 array clipped to [0, 1]

    Raises:
        ValueError: if `geoms` is neither of shape (N, 4, 2) nor of shape (N, 4)
    """
    # Polygon
    if geoms.ndim == 3 and geoms.shape[1:] == (4, 2):
        polygons: np.ndarray = np.empty(geoms.shape, dtype=np.float32)
        polygons[..., 0] = geoms[..., 0] / img_shape[1]
        polygons[..., 1] = geoms[..., 1] / img_shape[0]
        return polygons.clip(0, 1)
    # Straight box
    if geoms.ndim == 2 and geoms.shape[1] == 4:
        boxes: np.ndarray = np.empty(geoms.shape, dtype=np.float32)
        boxes[:, ::2] = geoms[:, ::2] / img_shape[1]
        boxes[:, 1::2] = geoms[:, 1::2] / img_shape[0]
        return boxes.clip(0, 1)

    raise ValueError(f"invalid format for arg `geoms`: {geoms.shape}")


def extract_crops(img: np.ndarray, boxes: np.ndarray) -> list[np.ndarray]:
    """Created cropped images from list of bounding boxes

    Args:
        img: input image
        boxes: bounding boxes of shape (N, 4) where N is the number of boxes, and the relative
            coordinates (xmin, ymin, xmax, ymax). Boxes with an integer dtype are used as they are,
            without any projection.

    Returns:
        list of cropped images

    Raises:
        AssertionError: if the second dimension of `boxes` is not 4
    """
    if boxes.shape[0] == 0:
        return []
    if boxes.shape[1] != 4:
        raise AssertionError("boxes are expected to be relative and in order (xmin, ymin, xmax, ymax)")

    # Project relative coordinates
    _boxes = boxes.copy()
    h, w = img.shape[:2]
    if not np.issubdtype(_boxes.dtype, np.integer):
        _boxes[:, [0, 2]] *= w
        _boxes[:, [1, 3]] *= h
        _boxes = _boxes.round().astype(int)
        # Add last index: slicing excludes its upper bound, so xmax and ymax (the last two COLUMNS of
        # every box) are shifted by one to keep the last row and column of each box in the crop
        _boxes[:, 2:] += 1
    return deepcopy([img[box[1] : box[3], box[0] : box[2]] for box in _boxes])


def extract_rcrops(
    img: np.ndarray, polys: np.ndarray, dtype=np.float32, assume_horizontal: bool = False
) -> list[np.ndarray]:
    """Created cropped images from list of rotated bounding boxes

    Args:
        img: input image
        polys: bounding boxes of shape (N, 4, 2). Boxes with an integer dtype are used as they are,
            the others are multiplied by the image width and height.
        dtype: target data type of bounding boxes
        assume_horizontal: whether the boxes are assumed to be only horizontally oriented

    Returns:
        list of cropped images

    Raises:
        AssertionError: if `polys` is not of shape (N, 4, 2)
    """
    if polys.shape[0] == 0:
        return []
    if polys.shape[1:] != (4, 2):
        raise AssertionError("polys are expected to be quadrilateral, of shape (N, 4, 2)")

    # Project relative coordinates
    _boxes = polys.copy()
    height, width = img.shape[:2]
    if not np.issubdtype(_boxes.dtype, np.integer):
        _boxes[:, :, 0] *= width
        _boxes[:, :, 1] *= height

    src_img = img

    # Handle only horizontal oriented boxes
    if assume_horizontal:
        crops = []

        for box in _boxes:
            # Calculate the centroid of the quadrilateral
            centroid = np.mean(box, axis=0)

            # Divide the points into left and right
            left_points = box[box[:, 0] < centroid[0]]
            right_points = box[box[:, 0] >= centroid[0]]

            # Sort the left points according to the y-axis
            left_points = left_points[np.argsort(left_points[:, 1])]
            top_left_pt = left_points[0]
            bottom_left_pt = left_points[-1]
            # Sort the right points according to the y-axis
            right_points = right_points[np.argsort(right_points[:, 1])]
            top_right_pt = right_points[0]
            bottom_right_pt = right_points[-1]
            box_points = np.array(
                [top_left_pt, bottom_left_pt, top_right_pt, bottom_right_pt],
                dtype=dtype,
            )

            # Get the width and height of the rectangle that will contain the warped quadrilateral
            width_upper = np.linalg.norm(top_right_pt - top_left_pt)
            width_lower = np.linalg.norm(bottom_right_pt - bottom_left_pt)
            height_left = np.linalg.norm(bottom_left_pt - top_left_pt)
            height_right = np.linalg.norm(bottom_right_pt - top_right_pt)

            # Get the maximum width and height
            rect_width = max(int(width_upper), int(width_lower))
            rect_height = max(int(height_left), int(height_right))

            # Destination corners, listed in the same order as `box_points`
            dst_pts = np.array(
                [
                    # top-left
                    [0, 0],
                    # bottom-left
                    [0, rect_height - 1],
                    # top-right
                    [rect_width - 1, 0],
                    # bottom-right
                    [rect_width - 1, rect_height - 1],
                ],
                dtype=dtype,
            )

            # Get the perspective transform matrix using the box points
            affine_mat = cv2.getPerspectiveTransform(box_points, dst_pts)

            # Perform the perspective warp to get the rectified crop
            crop = cv2.warpPerspective(
                src_img,
                affine_mat,
                (rect_width, rect_height),
            )

            # Add the crop to the list of crops
            crops.append(crop)

    else:
        # Handle any oriented boxes: the first three points of each polygon define the affine transform
        src_pts = _boxes[:, :3].astype(np.float32)
        # Preserve size
        d1 = np.linalg.norm(src_pts[:, 0] - src_pts[:, 1], axis=-1)
        d2 = np.linalg.norm(src_pts[:, 1] - src_pts[:, 2], axis=-1)
        # (N, 3, 2)
        dst_pts = np.zeros((_boxes.shape[0], 3, 2), dtype=dtype)
        dst_pts[:, 1, 0] = dst_pts[:, 2, 0] = d1 - 1
        dst_pts[:, 2, 1] = d2 - 1
        # Use a warp transformation to extract the crop
        crops = [
            cv2.warpAffine(
                src_img,
                # Transformation matrix
                cv2.getAffineTransform(src_pts[idx], dst_pts[idx]),
                (int(d1[idx]), int(d2[idx])),
            )
            for idx in range(_boxes.shape[0])
        ]
    return crops