Spaces:

JaceWei
/

PaperShow

Sleeping

PaperShow / Paper2Video /src /evaluation /PresentQuiz /docling /models /base_ocr_model.py

ZaynZhu

Clean version without large assets

7c08dc3 5 months ago

6.92 kB

	import copy
	import logging
	from abc import abstractmethod
	from pathlib import Path
	from typing import Iterable, List

	import numpy as np
	from docling_core.types.doc import BoundingBox, CoordOrigin
	from PIL import Image, ImageDraw
	from rtree import index
	from scipy.ndimage import binary_dilation, find_objects, label

	from docling.datamodel.base_models import Cell, OcrCell, Page
	from docling.datamodel.document import ConversionResult
	from docling.datamodel.pipeline_options import OcrOptions
	from docling.datamodel.settings import settings
	from docling.models.base_model import BasePageModel

	_log = logging.getLogger(__name__)


	class BaseOcrModel(BasePageModel):
	    """Shared base class for OCR page models.

	    Provides the engine-independent pieces of the OCR step: choosing which
	    page regions to OCR, merging OCR cells with programmatic cells, and
	    rendering a debug visualization. Subclasses implement ``__call__``.
	    """

	    def __init__(self, enabled: bool, options: OcrOptions):
	        """Store the ``enabled`` flag and the OCR ``options``."""
	        self.enabled = enabled
	        self.options = options

	    # Computes the optimum amount and coordinates of rectangles to OCR on a given page
	    def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
	        """Return the page regions that should be sent to OCR.

	        The bitmap rectangles reported by the page backend are rasterized,
	        dilated so that nearby bitmaps merge, and grouped into connected
	        components. Depending on how much of the page they cover, the result
	        is:

	        * one full-page rectangle, if ``options.force_full_page_ocr`` is set
	          or the coverage exceeds
	          ``max(0.75, options.bitmap_area_threshold)``;
	        * the merged per-component rectangles, if the coverage exceeds
	          ``options.bitmap_area_threshold``;
	        * an empty list otherwise.

	        All returned boxes use ``CoordOrigin.TOPLEFT``.
	        """
	        BITMAP_COVERAGE_THRESHOLD = 0.75
	        assert page.size is not None

	        def find_ocr_rects(size, bitmap_rects):
	            """Return ``(area_fraction, bounding_boxes)`` for ``bitmap_rects``."""
	            image = Image.new(
	                "1", (round(size.width), round(size.height))
	            )  # '1' mode is binary

	            # Draw all bitmap rects into a binary image.
	            # NOTE(review): the rects are drawn as-is, so this assumes they
	            # are already in top-left page coordinates -- confirm against the
	            # backend's get_bitmap_rects().
	            draw = ImageDraw.Draw(image)
	            for rect in bitmap_rects:
	                x0, y0, x1, y1 = rect.as_tuple()
	                x0, y0, x1, y1 = round(x0), round(y0), round(x1), round(y1)
	                draw.rectangle([(x0, y0), (x1, y1)], fill=1)

	            np_image = np.array(image)

	            # Dilate with a 20x20 structuring element (roughly 10 pixels in
	            # every direction) to merge nearby bitmap rectangles.
	            structure = np.ones((20, 20))
	            np_image = binary_dilation(np_image > 0, structure=structure)

	            # Find the connected components of the non-zero (bitmap) regions.
	            labeled_image, num_features = label(np_image > 0)

	            # Find enclosing bounding boxes for each connected component.
	            # Slice stops are exclusive, hence the "- 1" on right/bottom.
	            slices = find_objects(labeled_image)
	            bounding_boxes = [
	                BoundingBox(
	                    l=slc[1].start,
	                    t=slc[0].start,
	                    r=slc[1].stop - 1,
	                    b=slc[0].stop - 1,
	                    coord_origin=CoordOrigin.TOPLEFT,
	                )
	                for slc in slices
	            ]

	            # Fraction of the page area covered by the (dilated) bitmap mask.
	            area_frac = np.sum(np_image > 0) / (size.width * size.height)

	            return (area_frac, bounding_boxes)  # fraction covered  # boxes

	        if page._backend is not None:
	            bitmap_rects = page._backend.get_bitmap_rects()
	        else:
	            bitmap_rects = []
	        coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)

	        # return full-page rectangle if page is dominantly covered with bitmaps
	        if self.options.force_full_page_ocr or coverage > max(
	            BITMAP_COVERAGE_THRESHOLD, self.options.bitmap_area_threshold
	        ):
	            return [
	                BoundingBox(
	                    l=0,
	                    t=0,
	                    r=page.size.width,
	                    b=page.size.height,
	                    coord_origin=CoordOrigin.TOPLEFT,
	                )
	            ]
	        # return individual rectangles if the bitmap coverage is above the threshold
	        elif coverage > self.options.bitmap_area_threshold:
	            return ocr_rects
	        else:  # overall coverage of bitmaps is too low, drop all bitmap rectangles.
	            return []

	    # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
	    def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
	        """Return the OCR cells whose bbox intersects no programmatic cell.

	        The bounding boxes of ``programmatic_cells`` are indexed in a 2-D
	        R-tree; an OCR cell is kept only if an intersection query with its
	        own bounding box yields no hit.
	        """
	        # Create R-tree index for programmatic cells
	        p = index.Property()
	        p.dimension = 2
	        idx = index.Index(properties=p)
	        for i, cell in enumerate(programmatic_cells):
	            idx.insert(i, cell.bbox.as_tuple())

	        def is_overlapping_with_existing_cells(ocr_cell):
	            # Any hit in the R-tree counts as an overlap; any() stops at the
	            # first hit instead of collecting all of them.
	            # This is a weak criterion but it works.
	            return any(
	                True for _ in idx.intersection(ocr_cell.bbox.as_tuple())
	            )

	        filtered_ocr_cells = [
	            rect for rect in ocr_cells if not is_overlapping_with_existing_cells(rect)
	        ]
	        return filtered_ocr_cells

	    def post_process_cells(self, ocr_cells, programmatic_cells):
	        r"""
	        Post-process the OCR and programmatic cells and return the final list of cells.

	        If ``options.force_full_page_ocr`` is set, only the OCR cells are
	        used, each converted to a plain ``Cell`` (id, text and bbox are
	        copied). Otherwise the OCR cells that do not overlap any programmatic
	        cell are appended to ``programmatic_cells`` -- the list is extended
	        in place -- and that same list is returned.
	        """
	        if self.options.force_full_page_ocr:
	            # If a full page OCR is forced, use only the OCR cells
	            cells = [
	                Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
	                for c_ocr in ocr_cells
	            ]
	            return cells

	        ## Remove OCR cells which overlap with programmatic cells.
	        filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
	        programmatic_cells.extend(filtered_ocr_cells)
	        return programmatic_cells

	    def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
	        """Render a debug image of the OCR rectangles and the page cells.

	        OCR rectangles are drawn as translucent yellow fills; cells from
	        ``page.cells`` are outlined in magenta when they are ``OcrCell``
	        instances and in gray otherwise. With ``show=True`` the image is
	        displayed via ``Image.show()``; otherwise it is saved as
	        ``ocr_page_<page_no>.png`` in a ``debug_<input file stem>`` directory
	        under ``settings.debug.debug_output_path``.
	        """
	        image = copy.deepcopy(page.image)
	        scale_x = image.width / page.size.width
	        scale_y = image.height / page.size.height

	        def to_image_coords(bbox):
	            # Page coordinates -> image pixels. Horizontal values scale with
	            # scale_x and vertical values with scale_y.
	            x0, y0, x1, y1 = bbox.as_tuple()
	            return x0 * scale_x, y0 * scale_y, x1 * scale_x, y1 * scale_y

	        draw = ImageDraw.Draw(image, "RGBA")

	        # Draw OCR rectangles as yellow filled rect
	        shade_color = (255, 255, 0, 40)  # transparent yellow
	        for rect in ocr_rects:
	            x0, y0, x1, y1 = to_image_coords(rect)
	            draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)

	        # Draw OCR and programmatic cells
	        for tc in page.cells:
	            x0, y0, x1, y1 = to_image_coords(tc.bbox)

	            # Make sure y0 is the smaller coordinate before drawing.
	            if y1 <= y0:
	                y1, y0 = y0, y1

	            color = "magenta" if isinstance(tc, OcrCell) else "gray"
	            draw.rectangle([(x0, y0), (x1, y1)], outline=color)

	        if show:
	            image.show()
	        else:
	            out_path: Path = (
	                Path(settings.debug.debug_output_path)
	                / f"debug_{conv_res.input.file.stem}"
	            )
	            out_path.mkdir(parents=True, exist_ok=True)

	            out_file = out_path / f"ocr_page_{page.page_no:05}.png"
	            image.save(str(out_file), format="png")

	    @abstractmethod
	    def __call__(
	        self, conv_res: ConversionResult, page_batch: Iterable[Page]
	    ) -> Iterable[Page]:
	        """Run OCR over ``page_batch``; to be implemented by subclasses."""
	        pass