Spaces:

redhairedshanks1
/

dots-ocr

Runtime error

App Files Files Community

dots-ocr / dots_ocr /utils /doc_utils.py

redhairedshanks1

Upload 61 files

b56e481 verified 5 months ago

raw

history blame contribute delete

1.9 kB

	import fitz
	import numpy as np
	import enum
	from pydantic import BaseModel, Field
	from PIL import Image


	class SupportedPdfParseMethod(enum.Enum):
	    """Closed set of PDF parse-method identifiers.

	    Each member's value is the lowercase string form of its name, so a
	    member can be looked up from that string, e.g.
	    ``SupportedPdfParseMethod('ocr')``.
	    """

	    # Identifier with string value 'ocr'.
	    OCR = 'ocr'
	    # Identifier with string value 'txt'.
	    TXT = 'txt'


	class PageInfo(BaseModel):
	    """Dimensions of a single page.

	    Holds a width ``w`` and a height ``h``, both floats. The unit is not
	    stated in this module.
	    """

	    # Neither field is given a default value in its Field(...) call.
	    w: float = Field(
	        description='the width of page',
	    )
	    h: float = Field(
	        description='the height of page',
	    )


	def fitz_doc_to_image(doc, target_dpi=200, origin_dpi=None) -> "Image.Image":
	    """Render a PyMuPDF page to an RGB PIL image.

	    The page is rasterized at ``target_dpi``. If either side of the result
	    is larger than 4500 pixels, the page is rendered again at the fitz
	    default resolution (scale factor 1.0, i.e. 72 dpi) and that smaller
	    rendering is returned instead.

	    Args:
	        doc: Object exposing ``get_pixmap(matrix=..., alpha=...)``. Despite
	            the name this is a single page, not a whole document; the
	            loader in this file passes ``doc[index]``.
	        target_dpi (int, optional): Resolution of the first rendering
	            attempt. Defaults to 200.
	        origin_dpi: Unused. Kept only so existing callers keep working.

	    Returns:
	        PIL.Image.Image: The rendered page in 'RGB' mode (no alpha channel).
	    """
	    max_side_px = 4500  # size limit above which the low-resolution fallback is used
	    base_dpi = 72  # scale factor is expressed relative to 72 dpi

	    zoom = target_dpi / base_dpi
	    pm = doc.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

	    if pm.width > max_side_px or pm.height > max_side_px:
	        # Rendering is too large: redo it unscaled (fitz default dpi).
	        pm = doc.get_pixmap(matrix=fitz.Matrix(1.0, 1.0), alpha=False)

	    # ``Image`` is the module-level ``from PIL import Image``; the redundant
	    # function-level re-import was dropped.
	    return Image.frombytes('RGB', (pm.width, pm.height), pm.samples)


	def load_images_from_pdf(pdf_file, dpi=200, start_page_id=0, end_page_id=None) -> list:
	    """Render an inclusive range of pages of a PDF into images.

	    Args:
	        pdf_file: Value handed to ``fitz.open``.
	        dpi: Forwarded to ``fitz_doc_to_image`` as ``target_dpi``.
	            Defaults to 200.
	        start_page_id: Zero-based index of the first page to render.
	        end_page_id: Zero-based index of the last page to render
	            (inclusive). ``None`` or a negative value selects the last
	            page. A value beyond the last page is clamped to it and a
	            notice is printed.

	    Returns:
	        list: One rendered image per selected page, in page order.
	    """
	    with fitz.open(pdf_file) as pdf:
	        last_index = pdf.page_count - 1

	        # Only a non-negative, non-None end index counts as explicit.
	        explicit_end = end_page_id is not None and end_page_id >= 0
	        if not explicit_end:
	            end_page_id = last_index

	        if end_page_id > last_index:
	            print('end_page_id is out of range, use images length')
	            end_page_id = last_index

	        rendered = [
	            fitz_doc_to_image(pdf[page_index], target_dpi=dpi)
	            for page_index in range(0, pdf.page_count)
	            if start_page_id <= page_index <= end_page_id
	        ]
	    return rendered