MikeTrizna's picture
Upload folder using huggingface_hub
f3270e6 verified
# Copyright (C) 2021-2025, Mindee.
# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
from copy import deepcopy
from math import ceil
import cv2
import numpy as np
from .common_types import BoundingBox, Polygon4P
# Explicit public API of this module (controls `from <module> import *`).
__all__ = [
    "bbox_to_polygon",
    "polygon_to_bbox",
    "resolve_enclosing_bbox",
    "resolve_enclosing_rbbox",
    "rotate_boxes",
    "compute_expanded_shape",
    "rotate_image",
    "remove_image_padding",
    "estimate_page_angle",
    "convert_to_relative_coords",
    "rotate_abs_geoms",
    "extract_crops",
    "extract_rcrops",
    "detach_scores",
]
def bbox_to_polygon(bbox: BoundingBox) -> Polygon4P:
    """Expand a 2-point bounding box into its 4 corner points.

    Args:
        bbox: a bounding box ((xmin, ymin), (xmax, ymax))

    Returns:
        a polygon (top-left, top-right, bottom-left, bottom-right)
    """
    (xmin, ymin), (xmax, ymax) = bbox
    return (xmin, ymin), (xmax, ymin), (xmin, ymax), (xmax, ymax)
def polygon_to_bbox(polygon: Polygon4P) -> BoundingBox:
    """Compute the axis-aligned bounding box of a 4-point polygon.

    Args:
        polygon: a polygon

    Returns:
        a bounding box ((xmin, ymin), (xmax, ymax))
    """
    xs = [pt[0] for pt in polygon]
    ys = [pt[1] for pt in polygon]
    return (min(xs), min(ys)), (max(xs), max(ys))
def detach_scores(boxes: list[np.ndarray]) -> tuple[list[np.ndarray], list[np.ndarray]]:
    """Detach the objectness scores from box predictions

    Args:
        boxes: list of arrays with boxes of shape (N, 5) or (N, 5, 2)

    Returns:
        a tuple of two lists: the first one contains the boxes without the objectness scores,
        the second one contains the objectness scores
    """
    loc_preds: list[np.ndarray] = []
    obj_scores: list[np.ndarray] = []
    for pred in boxes:
        if pred.ndim == 2:
            # Straight boxes (N, 5): the score is the last column
            loc_preds.append(pred[:, :-1])
            obj_scores.append(pred[:, -1])
        else:
            # Rotated boxes (N, 5, 2): the score sits in the last point's last coordinate
            loc_preds.append(pred[:, :-1])
            obj_scores.append(pred[:, -1, -1])
    # NOTE: an explicit loop (rather than `zip(*...)` unpacking) keeps an empty
    # input list from raising a ValueError on tuple unpacking
    return loc_preds, obj_scores
def resolve_enclosing_bbox(bboxes: list[BoundingBox] | np.ndarray) -> BoundingBox | np.ndarray:
    """Compute the box enclosing all the given boxes.

    Args:
        bboxes: boxes in one of the following formats:
            - an array of boxes: (*, 4), where boxes have this shape:
            (xmin, ymin, xmax, ymax)
            - a list of BoundingBox

    Returns:
        an array (xmin, ymin, xmax, ymax) enclosing all boxes, or a BoundingBox
    """
    if isinstance(bboxes, np.ndarray):
        # Columns are (xmin, ymin, xmax, ymax): take min of mins, max of maxs
        return np.array([
            bboxes[:, 0].min(),
            bboxes[:, 1].min(),
            bboxes[:, 2].max(),
            bboxes[:, 3].max(),
        ])
    # List of 2-point boxes: flatten every corner and take the extrema
    xs = [pt[0] for box in bboxes for pt in box]
    ys = [pt[1] for box in bboxes for pt in box]
    return (min(xs), min(ys)), (max(xs), max(ys))
def resolve_enclosing_rbbox(rbboxes: list[np.ndarray], intermed_size: int = 1024) -> np.ndarray:
    """Compute the rotated box enclosing all the given rotated boxes.

    Args:
        rbboxes: boxes in one of the following formats:
            - an array of boxes: (*, 4, 2), where boxes have this shape:
            (x1, y1), (x2, y2), (x3, y3), (x4, y4)
            - a list of BoundingBox
        intermed_size: size of the intermediate image

    Returns:
        a (4, 2) array (enclosing rotated box)
    """
    # Gather every corner of every box into a single point cloud
    points: np.ndarray = np.concatenate(rbboxes, axis=0)
    # Scale up to absolute coordinates, since minAreaRect works on integer points
    points *= intermed_size
    min_rect = cv2.minAreaRect(points.astype(np.int32))
    # boxPoints yields the 4 rect corners; scale back to relative coordinates
    return cv2.boxPoints(min_rect) / intermed_size
def rotate_abs_points(points: np.ndarray, angle: float = 0.0) -> np.ndarray:
    """Rotate points counter-clockwise.

    Args:
        points: array of size (N, 2)
        angle: angle between -90 and +90 degrees

    Returns:
        Rotated points
    """
    theta = angle * np.pi / 180.0  # degrees -> radians for the trig functions
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    # Standard 2D CCW rotation matrix
    rot_mat = np.array([[cos_t, -sin_t], [sin_t, cos_t]], dtype=points.dtype)
    # Row-vector convention: p' = p @ R^T
    return np.matmul(points, rot_mat.T)
def compute_expanded_shape(img_shape: tuple[int, int], angle: float) -> tuple[int, int]:
    """Compute the shape of an expanded rotated image

    Args:
        img_shape: the height and width of the image
        angle: angle between -90 and +90 degrees

    Returns:
        the height and width of the rotated image
    """
    height, width = img_shape
    # Two adjacent corners relative to the image center; the other two corners
    # are their opposites, so these suffice to measure the rotated extent
    corners: np.ndarray = np.array([
        [width / 2, height / 2],
        [-width / 2, height / 2],
    ])
    rotated = rotate_abs_points(corners, angle)
    # Largest absolute reach on each axis, doubled, gives (new_width, new_height)
    new_w, new_h = 2 * np.abs(rotated).max(axis=0)
    return new_h, new_w
def rotate_abs_geoms(
    geoms: np.ndarray,
    angle: float,
    img_shape: tuple[int, int],
    expand: bool = True,
) -> np.ndarray:
    """Rotate a batch of bounding boxes or polygons by an angle around the
    image center.

    Args:
        geoms: (N, 4) or (N, 4, 2) array of ABSOLUTE coordinate boxes
        angle: anti-clockwise rotation angle in degrees
        img_shape: the height and width of the image
        expand: whether the image should be padded to avoid information loss

    Returns:
        A batch of rotated polygons (N, 4, 2)
    """
    # Straight boxes (N, 4) are first expanded into their 4 corner points
    if geoms.ndim == 2:
        polys = np.stack(
            [geoms[:, [0, 1]], geoms[:, [2, 1]], geoms[:, [2, 3]], geoms[:, [0, 3]]],
            axis=1,
        )
    else:
        polys = geoms
    # astype copies, so the in-place ops below never touch the caller's array
    polys = polys.astype(np.float32)
    # Express coordinates relative to the image center, with y pointing up
    polys[..., 0] -= img_shape[1] / 2
    polys[..., 1] = img_shape[0] / 2 - polys[..., 1]
    # Rotate every corner around the image center
    rotated = rotate_abs_points(polys.reshape(-1, 2), angle).reshape(-1, 4, 2)
    # Destination canvas: expanded to fit the rotation, or the original shape
    target_h, target_w = compute_expanded_shape(img_shape, angle) if expand else img_shape
    # Back to a top-left origin, clipping anything that falls outside the canvas
    rotated[..., 0] = (rotated[..., 0] + target_w / 2).clip(0, target_w)
    rotated[..., 1] = (target_h / 2 - rotated[..., 1]).clip(0, target_h)
    return rotated
def remap_boxes(loc_preds: np.ndarray, orig_shape: tuple[int, int], dest_shape: tuple[int, int]) -> np.ndarray:
    """Remaps a batch of rotated locpred (N, 4, 2) expressed for an origin_shape to a destination_shape.
    This does not impact the absolute shape of the boxes, but allow to calculate the new relative RotatedBbox
    coordinates after a resizing of the image.

    Args:
        loc_preds: (N, 4, 2) array of RELATIVE loc_preds
        orig_shape: shape of the origin image
        dest_shape: shape of the destination image

    Returns:
        A batch of rotated loc_preds (N, 4, 2) expressed in the destination referencial
    """
    if len(dest_shape) != 2:
        raise ValueError(f"Mask length should be 2, was found at: {len(dest_shape)}")
    if len(orig_shape) != 2:
        raise ValueError(f"Image_shape length should be 2, was found at: {len(orig_shape)}")
    orig_height, orig_width = orig_shape
    dest_height, dest_width = dest_shape
    remapped = loc_preds.copy()
    # Absolute positions are preserved: (dest - orig) / 2 is the translation of
    # the origin image when centered inside the destination canvas
    remapped[:, :, 0] = ((loc_preds[:, :, 0] * orig_width) + (dest_width - orig_width) / 2) / dest_width
    remapped[:, :, 1] = ((loc_preds[:, :, 1] * orig_height) + (dest_height - orig_height) / 2) / dest_height
    return remapped
def rotate_boxes(
loc_preds: np.ndarray,
angle: float,
orig_shape: tuple[int, int],
min_angle: float = 1.0,
target_shape: tuple[int, int] | None = None,
) -> np.ndarray:
"""Rotate a batch of straight bounding boxes (xmin, ymin, xmax, ymax, c) or rotated bounding boxes
(4, 2) of an angle, if angle > min_angle, around the center of the page.
If target_shape is specified, the boxes are remapped to the target shape after the rotation. This
is done to remove the padding that is created by rotate_page(expand=True)
Args:
loc_preds: (N, 4) or (N, 4, 2) array of RELATIVE boxes
angle: angle between -90 and +90 degrees
orig_shape: shape of the origin image
min_angle: minimum angle to rotate boxes
target_shape: shape of the destination image
Returns:
A batch of rotated boxes (N, 4, 2): or a batch of straight bounding boxes
"""
# Change format of the boxes to rotated boxes
_boxes = loc_preds.copy()
if _boxes.ndim == 2:
_boxes = np.stack(
[
_boxes[:, [0, 1]],
_boxes[:, [2, 1]],
_boxes[:, [2, 3]],
_boxes[:, [0, 3]],
],
axis=1,
)
# If small angle, return boxes (no rotation)
if abs(angle) < min_angle or abs(angle) > 90 - min_angle:
return _boxes
# Compute rotation matrix
angle_rad = angle * np.pi / 180.0 # compute radian angle for np functions
rotation_mat = np.array(
[[np.cos(angle_rad), -np.sin(angle_rad)], [np.sin(angle_rad), np.cos(angle_rad)]], dtype=_boxes.dtype
)
# Rotate absolute points
points: np.ndarray = np.stack((_boxes[:, :, 0] * orig_shape[1], _boxes[:, :, 1] * orig_shape[0]), axis=-1)
image_center = (orig_shape[1] / 2, orig_shape[0] / 2)
rotated_points = image_center + np.matmul(points - image_center, rotation_mat)
rotated_boxes: np.ndarray = np.stack(
(rotated_points[:, :, 0] / orig_shape[1], rotated_points[:, :, 1] / orig_shape[0]), axis=-1
)
# Apply a mask if requested
if target_shape is not None:
rotated_boxes = remap_boxes(rotated_boxes, orig_shape=orig_shape, dest_shape=target_shape)
return rotated_boxes
def rotate_image(
    image: np.ndarray,
    angle: float,
    expand: bool = False,
    preserve_origin_shape: bool = False,
) -> np.ndarray:
    """Rotate an image counterclockwise by an given angle.

    Args:
        image: numpy tensor to rotate (assumes an H x W x C layout — the pad
            below uses a 3-entry pad width; TODO confirm for grayscale inputs)
        angle: rotation angle in degrees, between -90 and +90
        expand: whether the image should be padded before the rotation
        preserve_origin_shape: if expand is set to True, resizes the final output to the original image size

    Returns:
        Rotated array, padded by 0 by default.
    """
    # Compute the expanded padding
    exp_img: np.ndarray
    if expand:
        # Symmetrically zero-pad the image so the rotated content is not cropped
        exp_shape = compute_expanded_shape(image.shape[:2], angle)
        h_pad, w_pad = (
            int(max(0, ceil(exp_shape[0] - image.shape[0]))),
            int(max(0, ceil(exp_shape[1] - image.shape[1]))),
        )
        exp_img = np.pad(image, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), (0, 0)))
    else:
        exp_img = image
    height, width = exp_img.shape[:2]
    # Rotate around the (possibly padded) image center; scale factor 1.0
    rot_mat = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1.0)
    # warpAffine fills the uncovered area with 0 (black) by default
    rot_img = cv2.warpAffine(exp_img, rot_mat, (width, height))
    if expand:
        # Pad to get the same aspect ratio
        if (image.shape[0] / image.shape[1]) != (rot_img.shape[0] / rot_img.shape[1]):
            # Pad width
            if (rot_img.shape[0] / rot_img.shape[1]) > (image.shape[0] / image.shape[1]):
                h_pad, w_pad = 0, int(rot_img.shape[0] * image.shape[1] / image.shape[0] - rot_img.shape[1])
            # Pad height
            else:
                h_pad, w_pad = int(rot_img.shape[1] * image.shape[0] / image.shape[1] - rot_img.shape[0]), 0
            rot_img = np.pad(rot_img, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), (0, 0)))
        if preserve_origin_shape:
            # rescale back to the original (height, width); cv2.resize takes (width, height)
            rot_img = cv2.resize(rot_img, image.shape[:-1][::-1], interpolation=cv2.INTER_LINEAR)
    return rot_img
def remove_image_padding(image: np.ndarray) -> np.ndarray:
    """Remove black border padding from an image

    Args:
        image: numpy tensor to remove padding from

    Returns:
        Image with padding removed, or the image unchanged if it is entirely black
    """
    # Rows / columns that contain at least one non-zero value
    rows = np.any(image, axis=1)
    cols = np.any(image, axis=0)
    row_idx = np.where(rows)[0]
    col_idx = np.where(cols)[0]
    # Fully black image: nothing to crop (fancy-indexing an empty index array
    # with [[0, -1]] would raise an IndexError)
    if row_idx.size == 0 or col_idx.size == 0:
        return image
    # Bounding box of the non-black region, inclusive on both ends
    rmin, rmax = row_idx[[0, -1]]
    cmin, cmax = col_idx[[0, -1]]
    return image[rmin : rmax + 1, cmin : cmax + 1]
def estimate_page_angle(polys: np.ndarray) -> float:
    """Takes a batch of rotated previously ORIENTED polys (N, 4, 2) (rectified by the classifier) and return the
    estimated angle ccw in degrees
    """
    # Mean left / right points with respect to the reading direction: corners 0
    # and 3 form the left edge, corners 1 and 2 the right edge
    xleft = polys[:, 0, 0] + polys[:, 3, 0]
    yleft = polys[:, 0, 1] + polys[:, 3, 1]
    xright = polys[:, 1, 0] + polys[:, 2, 0]
    yright = polys[:, 1, 1] + polys[:, 2, 1]
    # Promote division warnings to exceptions so degenerate boxes are caught
    with np.errstate(divide="raise", invalid="raise"):
        try:
            # Y axis runs top to bottom, hence the inverted numerator
            angles = np.arctan((yleft - yright) / (xright - xleft)) * 180 / np.pi
            return float(np.median(angles))
        except FloatingPointError:
            # Zero-width boxes produce inf/nan slopes: fall back to no rotation
            return 0.0
def convert_to_relative_coords(geoms: np.ndarray, img_shape: tuple[int, int]) -> np.ndarray:
    """Convert a geometry to relative coordinates

    Args:
        geoms: a set of polygons of shape (N, 4, 2) or of straight boxes of shape (N, 4)
        img_shape: the height and width of the image

    Returns:
        the updated geometry

    Raises:
        ValueError: if `geoms` matches neither accepted shape
    """
    height, width = img_shape
    # Polygons (N, 4, 2): x coordinates scaled by width, y by height
    if geoms.ndim == 3 and geoms.shape[1:] == (4, 2):
        rel: np.ndarray = np.empty(geoms.shape, dtype=np.float32)
        rel[..., 0] = geoms[..., 0] / width
        rel[..., 1] = geoms[..., 1] / height
        return rel.clip(0, 1)
    # Straight boxes (N, 4) laid out as (xmin, ymin, xmax, ymax)
    if geoms.ndim == 2 and geoms.shape[1] == 4:
        rel = np.empty(geoms.shape, dtype=np.float32)
        rel[:, ::2] = geoms[:, ::2] / width
        rel[:, 1::2] = geoms[:, 1::2] / height
        return rel.clip(0, 1)
    raise ValueError(f"invalid format for arg `geoms`: {geoms.shape}")
def extract_crops(img: np.ndarray, boxes: np.ndarray) -> list[np.ndarray]:
    """Created cropped images from list of bounding boxes

    Args:
        img: input image
        boxes: bounding boxes of shape (N, 4) where N is the number of boxes, and the relative
            coordinates (xmin, ymin, xmax, ymax)

    Returns:
        list of cropped images
    """
    if boxes.shape[0] == 0:
        return []
    if boxes.shape[1] != 4:
        raise AssertionError("boxes are expected to be relative and in order (xmin, ymin, xmax, ymax)")

    # Project relative coordinates
    _boxes = boxes.copy()
    h, w = img.shape[:2]
    if not np.issubdtype(_boxes.dtype, np.integer):
        _boxes[:, [0, 2]] *= w
        _boxes[:, [1, 3]] *= h
        _boxes = _boxes.round().astype(int)
        # Make xmax/ymax inclusive: bump the max columns of EVERY box by one.
        # (was `_boxes[2:] += 1`, which shifted whole boxes from row index 2 on
        # instead of incrementing the (xmax, ymax) columns)
        _boxes[:, 2:] += 1
    # deepcopy detaches the crops from the source image memory (slices are views)
    return deepcopy([img[box[1] : box[3], box[0] : box[2]] for box in _boxes])
def extract_rcrops(
    img: np.ndarray, polys: np.ndarray, dtype=np.float32, assume_horizontal: bool = False
) -> list[np.ndarray]:
    """Created cropped images from list of rotated bounding boxes

    Args:
        img: input image
        polys: bounding boxes of shape (N, 4, 2), relative coordinates unless an
            integer dtype is passed (then treated as absolute)
        dtype: target data type of bounding boxes
        assume_horizontal: whether the boxes are assumed to be only horizontally oriented

    Returns:
        list of cropped images

    Raises:
        AssertionError: if `polys` is not of shape (N, 4, 2)
    """
    if polys.shape[0] == 0:
        return []
    if polys.shape[1:] != (4, 2):
        raise AssertionError("polys are expected to be quadrilateral, of shape (N, 4, 2)")

    # Project relative coordinates to absolute pixel coordinates
    _boxes = polys.copy()
    height, width = img.shape[:2]
    if not np.issubdtype(_boxes.dtype, np.integer):
        _boxes[:, :, 0] *= width
        _boxes[:, :, 1] *= height
    src_img = img

    # Handle only horizontal oriented boxes
    if assume_horizontal:
        crops = []
        for box in _boxes:
            # Calculate the centroid of the quadrilateral
            centroid = np.mean(box, axis=0)
            # Divide the points into left and right halves relative to the centroid
            left_points = box[box[:, 0] < centroid[0]]
            right_points = box[box[:, 0] >= centroid[0]]
            # Sort the left points according to the y-axis
            left_points = left_points[np.argsort(left_points[:, 1])]
            top_left_pt = left_points[0]
            bottom_left_pt = left_points[-1]
            # Sort the right points according to the y-axis
            right_points = right_points[np.argsort(right_points[:, 1])]
            top_right_pt = right_points[0]
            bottom_right_pt = right_points[-1]
            # Source corners in (tl, bl, tr, br) order, matching dst_pts below
            box_points = np.array(
                [top_left_pt, bottom_left_pt, top_right_pt, bottom_right_pt],
                dtype=dtype,
            )
            # Get the width and height of the rectangle that will contain the warped quadrilateral
            width_upper = np.linalg.norm(top_right_pt - top_left_pt)
            width_lower = np.linalg.norm(bottom_right_pt - bottom_left_pt)
            height_left = np.linalg.norm(bottom_left_pt - top_left_pt)
            height_right = np.linalg.norm(bottom_right_pt - top_right_pt)
            # Get the maximum width and height
            rect_width = max(int(width_upper), int(width_lower))
            rect_height = max(int(height_left), int(height_right))
            dst_pts = np.array(
                [
                    [0, 0],  # top-left
                    # bottom-left
                    [0, rect_height - 1],
                    # top-right
                    [rect_width - 1, 0],
                    # bottom-right
                    [rect_width - 1, rect_height - 1],
                ],
                dtype=dtype,
            )
            # Get the perspective transform matrix using the box points
            # NOTE(review): despite its name, this is a 3x3 perspective matrix
            affine_mat = cv2.getPerspectiveTransform(box_points, dst_pts)
            # Perform the perspective warp to get the rectified crop
            crop = cv2.warpPerspective(
                src_img,
                affine_mat,
                (rect_width, rect_height),
            )
            # Add the crop to the list of crops
            crops.append(crop)
    # Handle any oriented boxes
    else:
        # An affine transform is fully defined by 3 point pairs: use corners 0-2
        src_pts = _boxes[:, :3].astype(np.float32)
        # Preserve size: edge lengths between consecutive corners
        d1 = np.linalg.norm(src_pts[:, 0] - src_pts[:, 1], axis=-1)
        d2 = np.linalg.norm(src_pts[:, 1] - src_pts[:, 2], axis=-1)
        # (N, 3, 2) destination triangles: (0, 0), (d1-1, 0), (d1-1, d2-1)
        dst_pts = np.zeros((_boxes.shape[0], 3, 2), dtype=dtype)
        dst_pts[:, 1, 0] = dst_pts[:, 2, 0] = d1 - 1
        dst_pts[:, 2, 1] = d2 - 1
        # Use a warp transformation to extract the crop
        crops = [
            cv2.warpAffine(
                src_img,
                # Transformation matrix
                cv2.getAffineTransform(src_pts[idx], dst_pts[idx]),
                (int(d1[idx]), int(d2[idx])),
            )
            for idx in range(_boxes.shape[0])
        ]
    return crops