MikeTrizna's picture
Upload folder using huggingface_hub
f3270e6 verified
# Copyright (C) 2021-2025, Mindee.
# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
from copy import deepcopy
from math import ceil
import cv2
import numpy as np
from .common_types import BoundingBox, Polygon4P
# Explicit public API of this module (controls `from <module> import *`).
__all__ = [
    "bbox_to_polygon",
    "polygon_to_bbox",
    "resolve_enclosing_bbox",
    "resolve_enclosing_rbbox",
    "rotate_boxes",
    "compute_expanded_shape",
    "rotate_image",
    "remove_image_padding",
    "estimate_page_angle",
    "convert_to_relative_coords",
    "rotate_abs_geoms",
    "extract_crops",
    "extract_rcrops",
    "detach_scores",
]
def bbox_to_polygon(bbox: BoundingBox) -> Polygon4P:
    """Expand a 2-point bounding box into its 4 corner points.

    Args:
        bbox: a bounding box ((xmin, ymin), (xmax, ymax))

    Returns:
        a polygon (top-left, top-right, bottom-left, bottom-right)
    """
    (xmin, ymin), (xmax, ymax) = bbox
    return (xmin, ymin), (xmax, ymin), (xmin, ymax), (xmax, ymax)
def polygon_to_bbox(polygon: Polygon4P) -> BoundingBox:
    """Compute the axis-aligned bounding box of a 4-point polygon.

    Args:
        polygon: a polygon

    Returns:
        a bounding box ((xmin, ymin), (xmax, ymax))
    """
    xs = [pt[0] for pt in polygon]
    ys = [pt[1] for pt in polygon]
    return (min(xs), min(ys)), (max(xs), max(ys))
def detach_scores(boxes: list[np.ndarray]) -> tuple[list[np.ndarray], list[np.ndarray]]:
    """Detach the objectness scores from box predictions

    Args:
        boxes: list of arrays with boxes of shape (N, 5) or (N, 5, 2)

    Returns:
        a tuple of two lists: the first one contains the boxes without the objectness scores,
        the second one contains the objectness scores
    """
    loc_preds: list[np.ndarray] = []
    obj_scores: list[np.ndarray] = []
    for pred in boxes:
        if pred.ndim == 2:
            # Straight boxes (N, 5): the score is the last column
            loc_preds.append(pred[:, :-1])
            obj_scores.append(pred[:, -1])
        else:
            # Rotated boxes (N, 5, 2): the score sits in the last point's last coordinate
            loc_preds.append(pred[:, :-1])
            obj_scores.append(pred[:, -1, -1])
    # NOTE: an explicit loop (rather than `zip(*...)` unpacking) keeps an empty
    # input list from raising a ValueError on tuple unpacking
    return loc_preds, obj_scores
def resolve_enclosing_bbox(bboxes: list[BoundingBox] | np.ndarray) -> BoundingBox | np.ndarray:
    """Compute the box enclosing all the given boxes.

    Args:
        bboxes: boxes in one of the following formats:
            - an array of boxes: (*, 4), where boxes have this shape:
            (xmin, ymin, xmax, ymax)
            - a list of BoundingBox

    Returns:
        an array (xmin, ymin, xmax, ymax) enclosing all boxes, or a BoundingBox
    """
    if isinstance(bboxes, np.ndarray):
        # Columns are (xmin, ymin, xmax, ymax): take min of mins, max of maxs
        return np.array([
            bboxes[:, 0].min(),
            bboxes[:, 1].min(),
            bboxes[:, 2].max(),
            bboxes[:, 3].max(),
        ])
    # List of 2-point boxes: flatten every corner and take the extrema
    xs = [pt[0] for box in bboxes for pt in box]
    ys = [pt[1] for box in bboxes for pt in box]
    return (min(xs), min(ys)), (max(xs), max(ys))
def resolve_enclosing_rbbox(rbboxes: list[np.ndarray], intermed_size: int = 1024) -> np.ndarray:
    """Compute the rotated box enclosing all the given rotated boxes.

    Args:
        rbboxes: boxes in one of the following formats:
            - an array of boxes: (*, 4, 2), where boxes have this shape:
            (x1, y1), (x2, y2), (x3, y3), (x4, y4)
            - a list of BoundingBox
        intermed_size: size of the intermediate image

    Returns:
        a (4, 2) array (enclosing rotated box)
    """
    # Gather every corner of every box into a single point cloud
    points: np.ndarray = np.concatenate(rbboxes, axis=0)
    # Scale up to absolute coordinates, since minAreaRect works on integer points
    points *= intermed_size
    min_rect = cv2.minAreaRect(points.astype(np.int32))
    # boxPoints yields the 4 rect corners; scale back to relative coordinates
    return cv2.boxPoints(min_rect) / intermed_size
def rotate_abs_points(points: np.ndarray, angle: float = 0.0) -> np.ndarray:
    """Rotate points counter-clockwise.

    Args:
        points: array of size (N, 2)
        angle: angle between -90 and +90 degrees

    Returns:
        Rotated points
    """
    theta = angle * np.pi / 180.0  # degrees -> radians for the trig functions
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    # Standard 2D CCW rotation matrix
    rot_mat = np.array([[cos_t, -sin_t], [sin_t, cos_t]], dtype=points.dtype)
    # Row-vector convention: p' = p @ R^T
    return np.matmul(points, rot_mat.T)
def compute_expanded_shape(img_shape: tuple[int, int], angle: float) -> tuple[int, int]:
    """Compute the shape of an expanded rotated image

    Args:
        img_shape: the height and width of the image
        angle: angle between -90 and +90 degrees

    Returns:
        the height and width of the rotated image
    """
    height, width = img_shape
    # Two adjacent corners relative to the image center; the other two corners
    # are their opposites, so these suffice to measure the rotated extent
    corners: np.ndarray = np.array([
        [width / 2, height / 2],
        [-width / 2, height / 2],
    ])
    rotated = rotate_abs_points(corners, angle)
    # Largest absolute reach on each axis, doubled, gives (new_width, new_height)
    new_w, new_h = 2 * np.abs(rotated).max(axis=0)
    return new_h, new_w
def rotate_abs_geoms(
    geoms: np.ndarray,
    angle: float,
    img_shape: tuple[int, int],
    expand: bool = True,
) -> np.ndarray:
    """Rotate a batch of bounding boxes or polygons by an angle around the
    image center.

    Args:
        geoms: (N, 4) or (N, 4, 2) array of ABSOLUTE coordinate boxes
        angle: anti-clockwise rotation angle in degrees
        img_shape: the height and width of the image
        expand: whether the image should be padded to avoid information loss

    Returns:
        A batch of rotated polygons (N, 4, 2)
    """
    # Straight boxes (N, 4) are first expanded into their 4 corner points
    if geoms.ndim == 2:
        polys = np.stack(
            [geoms[:, [0, 1]], geoms[:, [2, 1]], geoms[:, [2, 3]], geoms[:, [0, 3]]],
            axis=1,
        )
    else:
        polys = geoms
    # astype copies, so the in-place ops below never touch the caller's array
    polys = polys.astype(np.float32)
    # Express coordinates relative to the image center, with y pointing up
    polys[..., 0] -= img_shape[1] / 2
    polys[..., 1] = img_shape[0] / 2 - polys[..., 1]
    # Rotate every corner around the image center
    rotated = rotate_abs_points(polys.reshape(-1, 2), angle).reshape(-1, 4, 2)
    # Destination canvas: expanded to fit the rotation, or the original shape
    target_h, target_w = compute_expanded_shape(img_shape, angle) if expand else img_shape
    # Back to a top-left origin, clipping anything that falls outside the canvas
    rotated[..., 0] = (rotated[..., 0] + target_w / 2).clip(0, target_w)
    rotated[..., 1] = (target_h / 2 - rotated[..., 1]).clip(0, target_h)
    return rotated
def remap_boxes(loc_preds: np.ndarray, orig_shape: tuple[int, int], dest_shape: tuple[int, int]) -> np.ndarray:
    """Remaps a batch of rotated locpred (N, 4, 2) expressed for an origin_shape to a destination_shape.
    This does not impact the absolute shape of the boxes, but allow to calculate the new relative RotatedBbox
    coordinates after a resizing of the image.

    Args:
        loc_preds: (N, 4, 2) array of RELATIVE loc_preds
        orig_shape: shape of the origin image
        dest_shape: shape of the destination image

    Returns:
        A batch of rotated loc_preds (N, 4, 2) expressed in the destination referencial
    """
    if len(dest_shape) != 2:
        raise ValueError(f"Mask length should be 2, was found at: {len(dest_shape)}")
    if len(orig_shape) != 2:
        raise ValueError(f"Image_shape length should be 2, was found at: {len(orig_shape)}")
    orig_height, orig_width = orig_shape
    dest_height, dest_width = dest_shape
    remapped = loc_preds.copy()
    # Absolute positions are preserved: (dest - orig) / 2 is the translation of
    # the origin image when centered inside the destination canvas
    remapped[:, :, 0] = ((loc_preds[:, :, 0] * orig_width) + (dest_width - orig_width) / 2) / dest_width
    remapped[:, :, 1] = ((loc_preds[:, :, 1] * orig_height) + (dest_height - orig_height) / 2) / dest_height
    return remapped
def rotate_boxes(
loc_preds: np.ndarray,
angle: float,
orig_shape: tuple[int, int],
min_angle: float = 1.0,
target_shape: tuple[int, int] | None = None,
) -> np.ndarray:
"""Rotate a batch of straight bounding boxes (xmin, ymin, xmax, ymax, c) or rotated bounding boxes
(4, 2) of an angle, if angle > min_angle, around the center of the page.
If target_shape is specified, the boxes are remapped to the target shape after the rotation. This
is done to remove the padding that is created by rotate_page(expand=True)
Args:
loc_preds: (N, 4) or (N, 4, 2) array of RELATIVE boxes
angle: angle between -90 and +90 degrees
orig_shape: shape of the origin image
min_angle: minimum angle to rotate boxes
target_shape: shape of the destination image
Returns:
A batch of rotated boxes (N, 4, 2): or a batch of straight bounding boxes
"""
# Change format of the boxes to rotated boxes
_boxes = loc_preds.copy()
if _boxes.ndim == 2:
_boxes = np.stack(
[
_boxes[:, [0, 1]],
_boxes[:, [2, 1]],
_boxes[:, [2, 3]],
_boxes[:, [0, 3]],
],
axis=1,
)
# If small angle, return boxes (no rotation)
if abs(angle) < min_angle or abs(angle) > 90 - min_angle:
return _boxes
# Compute rotation matrix
angle_rad = angle * np.pi / 180.0 # compute radian angle for np functions
rotation_mat = np.array(
[[np.cos(angle_rad), -np.sin(angle_rad)], [np.sin(angle_rad), np.cos(angle_rad)]], dtype=_boxes.dtype
)
# Rotate absolute points
points: np.ndarray = np.stack((_boxes[:, :, 0] * orig_shape[1], _boxes[:, :, 1] * orig_shape[0]), axis=-1)
image_center = (orig_shape[1] / 2, orig_shape[0] / 2)
rotated_points = image_center + np.matmul(points - image_center, rotation_mat)
rotated_boxes: np.ndarray = np.stack(
(rotated_points[:, :, 0] / orig_shape[1], rotated_points[:, :, 1] / orig_shape[0]), axis=-1
)
# Apply a mask if requested
if target_shape is not None:
rotated_boxes = remap_boxes(rotated_boxes, orig_shape=orig_shape, dest_shape=target_shape)
return rotated_boxes
def rotate_image(
    image: np.ndarray,
    angle: float,
    expand: bool = False,
    preserve_origin_shape: bool = False,
) -> np.ndarray:
    """Rotate an image counterclockwise by an given angle.

    Args:
        image: numpy tensor to rotate (assumes an H x W x C layout — the pad
            below uses a 3-entry pad width; TODO confirm for grayscale inputs)
        angle: rotation angle in degrees, between -90 and +90
        expand: whether the image should be padded before the rotation
        preserve_origin_shape: if expand is set to True, resizes the final output to the original image size

    Returns:
        Rotated array, padded by 0 by default.
    """
    # Compute the expanded padding
    exp_img: np.ndarray
    if expand:
        # Symmetrically zero-pad the image so the rotated content is not cropped
        exp_shape = compute_expanded_shape(image.shape[:2], angle)
        h_pad, w_pad = (
            int(max(0, ceil(exp_shape[0] - image.shape[0]))),
            int(max(0, ceil(exp_shape[1] - image.shape[1]))),
        )
        exp_img = np.pad(image, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), (0, 0)))
    else:
        exp_img = image
    height, width = exp_img.shape[:2]
    # Rotate around the (possibly padded) image center; scale factor 1.0
    rot_mat = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1.0)
    # warpAffine fills the uncovered area with 0 (black) by default
    rot_img = cv2.warpAffine(exp_img, rot_mat, (width, height))
    if expand:
        # Pad to get the same aspect ratio
        if (image.shape[0] / image.shape[1]) != (rot_img.shape[0] / rot_img.shape[1]):
            # Pad width
            if (rot_img.shape[0] / rot_img.shape[1]) > (image.shape[0] / image.shape[1]):
                h_pad, w_pad = 0, int(rot_img.shape[0] * image.shape[1] / image.shape[0] - rot_img.shape[1])
            # Pad height
            else:
                h_pad, w_pad = int(rot_img.shape[1] * image.shape[0] / image.shape[1] - rot_img.shape[0]), 0
            rot_img = np.pad(rot_img, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), (0, 0)))
        if preserve_origin_shape:
            # rescale back to the original (height, width); cv2.resize takes (width, height)
            rot_img = cv2.resize(rot_img, image.shape[:-1][::-1], interpolation=cv2.INTER_LINEAR)
    return rot_img
def remove_image_padding(image: np.ndarray) -> np.ndarray:
    """Remove black border padding from an image

    Args:
        image: numpy tensor to remove padding from

    Returns:
        Image with padding removed, or the image unchanged if it is entirely black
    """
    # Rows / columns that contain at least one non-zero value
    rows = np.any(image, axis=1)
    cols = np.any(image, axis=0)
    row_idx = np.where(rows)[0]
    col_idx = np.where(cols)[0]
    # Fully black image: nothing to crop (fancy-indexing an empty index array
    # with [[0, -1]] would raise an IndexError)
    if row_idx.size == 0 or col_idx.size == 0:
        return image
    # Bounding box of the non-black region, inclusive on both ends
    rmin, rmax = row_idx[[0, -1]]
    cmin, cmax = col_idx[[0, -1]]
    return image[rmin : rmax + 1, cmin : cmax + 1]
def estimate_page_angle(polys: np.ndarray) -> float:
    """Takes a batch of rotated previously ORIENTED polys (N, 4, 2) (rectified by the classifier) and return the
    estimated angle ccw in degrees
    """
    # Mean left / right points with respect to the reading direction: corners 0
    # and 3 form the left edge, corners 1 and 2 the right edge
    xleft = polys[:, 0, 0] + polys[:, 3, 0]
    yleft = polys[:, 0, 1] + polys[:, 3, 1]
    xright = polys[:, 1, 0] + polys[:, 2, 0]
    yright = polys[:, 1, 1] + polys[:, 2, 1]
    # Promote division warnings to exceptions so degenerate boxes are caught
    with np.errstate(divide="raise", invalid="raise"):
        try:
            # Y axis runs top to bottom, hence the inverted numerator
            angles = np.arctan((yleft - yright) / (xright - xleft)) * 180 / np.pi
            return float(np.median(angles))
        except FloatingPointError:
            # Zero-width boxes produce inf/nan slopes: fall back to no rotation
            return 0.0
def convert_to_relative_coords(geoms: np.ndarray, img_shape: tuple[int, int]) -> np.ndarray:
    """Convert a geometry to relative coordinates

    Args:
        geoms: a set of polygons of shape (N, 4, 2) or of straight boxes of shape (N, 4)
        img_shape: the height and width of the image

    Returns:
        the updated geometry

    Raises:
        ValueError: if `geoms` matches neither accepted shape
    """
    height, width = img_shape
    # Polygons (N, 4, 2): x coordinates scaled by width, y by height
    if geoms.ndim == 3 and geoms.shape[1:] == (4, 2):
        rel: np.ndarray = np.empty(geoms.shape, dtype=np.float32)
        rel[..., 0] = geoms[..., 0] / width
        rel[..., 1] = geoms[..., 1] / height
        return rel.clip(0, 1)
    # Straight boxes (N, 4) laid out as (xmin, ymin, xmax, ymax)
    if geoms.ndim == 2 and geoms.shape[1] == 4:
        rel = np.empty(geoms.shape, dtype=np.float32)
        rel[:, ::2] = geoms[:, ::2] / width
        rel[:, 1::2] = geoms[:, 1::2] / height
        return rel.clip(0, 1)
    raise ValueError(f"invalid format for arg `geoms`: {geoms.shape}")
def extract_crops(img: np.ndarray, boxes: np.ndarray) -> list[np.ndarray]:
    """Created cropped images from list of bounding boxes

    Args:
        img: input image
        boxes: bounding boxes of shape (N, 4) where N is the number of boxes, and the relative
            coordinates (xmin, ymin, xmax, ymax)

    Returns:
        list of cropped images
    """
    if boxes.shape[0] == 0:
        return []
    if boxes.shape[1] != 4:
        raise AssertionError("boxes are expected to be relative and in order (xmin, ymin, xmax, ymax)")

    # Project relative coordinates
    _boxes = boxes.copy()
    h, w = img.shape[:2]
    if not np.issubdtype(_boxes.dtype, np.integer):
        _boxes[:, [0, 2]] *= w
        _boxes[:, [1, 3]] *= h
        _boxes = _boxes.round().astype(int)
        # Make xmax/ymax inclusive: bump the max columns of EVERY box by one.
        # (was `_boxes[2:] += 1`, which shifted whole boxes from row index 2 on
        # instead of incrementing the (xmax, ymax) columns)
        _boxes[:, 2:] += 1
    # deepcopy detaches the crops from the source image memory (slices are views)
    return deepcopy([img[box[1] : box[3], box[0] : box[2]] for box in _boxes])
def extract_rcrops(
    img: np.ndarray, polys: np.ndarray, dtype=np.float32, assume_horizontal: bool = False
) -> list[np.ndarray]:
    """Created cropped images from list of rotated bounding boxes

    Args:
        img: input image
        polys: bounding boxes of shape (N, 4, 2), relative coordinates unless an
            integer dtype is passed (then treated as absolute)
        dtype: target data type of bounding boxes
        assume_horizontal: whether the boxes are assumed to be only horizontally oriented

    Returns:
        list of cropped images

    Raises:
        AssertionError: if `polys` is not of shape (N, 4, 2)
    """
    if polys.shape[0] == 0:
        return []
    if polys.shape[1:] != (4, 2):
        raise AssertionError("polys are expected to be quadrilateral, of shape (N, 4, 2)")

    # Project relative coordinates to absolute pixel coordinates
    _boxes = polys.copy()
    height, width = img.shape[:2]
    if not np.issubdtype(_boxes.dtype, np.integer):
        _boxes[:, :, 0] *= width
        _boxes[:, :, 1] *= height
    src_img = img

    # Handle only horizontal oriented boxes
    if assume_horizontal:
        crops = []
        for box in _boxes:
            # Calculate the centroid of the quadrilateral
            centroid = np.mean(box, axis=0)
            # Divide the points into left and right halves relative to the centroid
            left_points = box[box[:, 0] < centroid[0]]
            right_points = box[box[:, 0] >= centroid[0]]
            # Sort the left points according to the y-axis
            left_points = left_points[np.argsort(left_points[:, 1])]
            top_left_pt = left_points[0]
            bottom_left_pt = left_points[-1]
            # Sort the right points according to the y-axis
            right_points = right_points[np.argsort(right_points[:, 1])]
            top_right_pt = right_points[0]
            bottom_right_pt = right_points[-1]
            # Source corners in (tl, bl, tr, br) order, matching dst_pts below
            box_points = np.array(
                [top_left_pt, bottom_left_pt, top_right_pt, bottom_right_pt],
                dtype=dtype,
            )
            # Get the width and height of the rectangle that will contain the warped quadrilateral
            width_upper = np.linalg.norm(top_right_pt - top_left_pt)
            width_lower = np.linalg.norm(bottom_right_pt - bottom_left_pt)
            height_left = np.linalg.norm(bottom_left_pt - top_left_pt)
            height_right = np.linalg.norm(bottom_right_pt - top_right_pt)
            # Get the maximum width and height
            rect_width = max(int(width_upper), int(width_lower))
            rect_height = max(int(height_left), int(height_right))
            dst_pts = np.array(
                [
                    [0, 0],  # top-left
                    # bottom-left
                    [0, rect_height - 1],
                    # top-right
                    [rect_width - 1, 0],
                    # bottom-right
                    [rect_width - 1, rect_height - 1],
                ],
                dtype=dtype,
            )
            # Get the perspective transform matrix using the box points
            # NOTE(review): despite its name, this is a 3x3 perspective matrix
            affine_mat = cv2.getPerspectiveTransform(box_points, dst_pts)
            # Perform the perspective warp to get the rectified crop
            crop = cv2.warpPerspective(
                src_img,
                affine_mat,
                (rect_width, rect_height),
            )
            # Add the crop to the list of crops
            crops.append(crop)
    # Handle any oriented boxes
    else:
        # An affine transform is fully defined by 3 point pairs: use corners 0-2
        src_pts = _boxes[:, :3].astype(np.float32)
        # Preserve size: edge lengths between consecutive corners
        d1 = np.linalg.norm(src_pts[:, 0] - src_pts[:, 1], axis=-1)
        d2 = np.linalg.norm(src_pts[:, 1] - src_pts[:, 2], axis=-1)
        # (N, 3, 2) destination triangles: (0, 0), (d1-1, 0), (d1-1, d2-1)
        dst_pts = np.zeros((_boxes.shape[0], 3, 2), dtype=dtype)
        dst_pts[:, 1, 0] = dst_pts[:, 2, 0] = d1 - 1
        dst_pts[:, 2, 1] = d2 - 1
        # Use a warp transformation to extract the crop
        crops = [
            cv2.warpAffine(
                src_img,
                # Transformation matrix
                cv2.getAffineTransform(src_pts[idx], dst_pts[idx]),
                (int(d1[idx]), int(d2[idx])),
            )
            for idx in range(_boxes.shape[0])
        ]
    return crops