dots-ocr / dots_ocr /utils /doc_utils.py
redhairedshanks1's picture
Upload 61 files
b56e481 verified
import fitz
import numpy as np
import enum
from pydantic import BaseModel, Field
from PIL import Image
class SupportedPdfParseMethod(enum.Enum):
    """Closed set of PDF parse methods, identified by a short string value."""

    # Member values are plain lowercase strings, so a member can be looked up
    # from its value, e.g. SupportedPdfParseMethod('ocr').
    OCR = 'ocr'
    TXT = 'txt'
class PageInfo(BaseModel):
    """The width and height of page"""

    # Both fields are required floats. The description strings are passed to
    # Field() at runtime, so their text is kept verbatim.
    w: float = Field(
        description='the width of page',
    )
    h: float = Field(
        description='the height of page',
    )
def fitz_doc_to_image(doc, target_dpi=200, origin_dpi=None) -> "Image.Image":
    """Render one PyMuPDF page to an RGB PIL image.

    Despite the parameter name, ``doc`` is used as a single page: the only
    thing called on it is ``get_pixmap``. The result is a PIL image, not a
    dict and not a numpy array.

    Args:
        doc: PyMuPDF page object to render.
        target_dpi (int, optional): Resolution to render at. Defaults to 200.
        origin_dpi: Unused. Kept in the signature only so existing callers
            that pass it keep working.

    Returns:
        PIL.Image.Image: The rendered page in ``'RGB'`` mode. When rendering
        at ``target_dpi`` produces a width or height above 4500 pixels, the
        page is rendered again at scale 1.0 (72 dpi) and that smaller
        rendering is returned instead.
    """
    max_side = 4500  # largest accepted pixmap width/height in pixels
    base_dpi = 72    # fitz default dpi, i.e. a scale factor of 1.0

    zoom = target_dpi / base_dpi
    pm = doc.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

    # Oversized rendering: fall back to the default resolution.
    if pm.width > max_side or pm.height > max_side:
        pm = doc.get_pixmap(matrix=fitz.Matrix(1.0, 1.0), alpha=False)

    # alpha=False above means the samples are packed 3-channel RGB bytes.
    return Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
def load_images_from_pdf(pdf_file, dpi=200, start_page_id=0, end_page_id=None) -> list:
    """Render a range of pages from a PDF into a list of images.

    Args:
        pdf_file: Value passed straight to ``fitz.open``.
        dpi (int, optional): Target dpi forwarded to ``fitz_doc_to_image``.
            Defaults to 200.
        start_page_id (int, optional): Zero-based index of the first page to
            render (inclusive). Defaults to 0.
        end_page_id (int, optional): Zero-based index of the last page to
            render (inclusive). ``None`` or a negative value means the last
            page of the document. A value past the last page is clamped to
            the last page and a notice is printed.

    Returns:
        list: One rendered image per selected page, in page order.
    """
    with fitz.open(pdf_file) as doc:
        last_page = doc.page_count - 1

        if end_page_id is not None and end_page_id >= 0:
            # Caller gave an explicit end; clamp it to the document length.
            if end_page_id > last_page:
                print('end_page_id is out of range, use images length')
                end_page_id = last_page
        else:
            # No usable end supplied: run through to the final page.
            end_page_id = last_page

        rendered = [
            fitz_doc_to_image(doc[page_no], target_dpi=dpi)
            for page_no in range(0, doc.page_count)
            if start_page_id <= page_no <= end_page_id
        ]
    return rendered