Spaces:
Sleeping
Sleeping
| """Responsible for (pre)processing images and PDFs before they are passed to the OCR | |
| engine and other miscellaneous actions concerning processing. | |
| """ | |
| import os | |
| from pathlib import Path | |
| from typing import List | |
| # import cv2 | |
| # import numpy as np | |
| import pyocr | |
| from pdf2image import pdf2image | |
| from PIL import Image #, ImageOps | |
# Resolution (dots per inch) passed to the PDF converters when rasterizing pages.
PDF_CONVERSION_DPI = 300
# An image is rotated only when the detected orientation confidence exceeds this
# value; otherwise it is left as-is.
ROTATION_CONFIDENCE_THRESHOLD = 2.0
| # def rotate_image(image: Image, angle: float): | |
| # """Rotates the given image by the given angle. | |
| # Args: | |
| # image(PIL.Image.Image): The image to be rotated. | |
| # angle(float): The angle to rotate the image by. | |
| # Returns: The rotated image. | |
| # """ | |
| # image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) | |
| # height, width, _ = image.shape # Get the image height, width, and channels | |
| # # Compute the rotation matrix | |
| # rotation_matrix = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1) | |
| # # Apply the rotation to the image | |
| # rotated_image = cv2.warpAffine(image, rotation_matrix, (width, height)) | |
| # rotated_image = Image.fromarray(cv2.cvtColor(rotated_image, cv2.COLOR_BGR2RGB)) | |
| # return rotated_image | |
| # class PDF_CONVERTER(enum.Enum): | |
| # PDF2IMAGE = 1 | |
| # IMAGEMAGICK = 2 | |
def correct_orientation(image: Image.Image) -> Image.Image:
    """Return an upright copy of the given image.

    The orientation is detected with Tesseract (through pyocr). When detection
    fails, the failure is printed and the image is treated as already upright.

    Args:
        image(PIL.Image.Image): The pillow image to be corrected.

    Returns: A new pillow image, rotated by the detected angle when the
        detection confidence exceeds ROTATION_CONFIDENCE_THRESHOLD, otherwise a
        plain copy. The original image is not closed.

    Raises:
        Exception: If Tesseract is not available.
    """
    if not pyocr.tesseract.is_available():
        raise Exception("Tesseract is not available.")

    try:
        detected = pyocr.tesseract.detect_orientation(image)
    except pyocr.PyocrException as e:
        print("Orientation detection failed: {}".format(e))
        # Fall back to an empty result so the defaults below apply
        # (angle 0, confidence 100), which yields an unrotated copy.
        detected = {}

    rotation = detected.get("angle", 0)
    certainty = detected.get("confidence", 100)

    if certainty > ROTATION_CONFIDENCE_THRESHOLD:
        return image.rotate(rotation, expand=True)
    return image.copy()
def convert_pdf_to_image_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
    """Rasterize every page of a PDF with pdf2image.

    Args:
        pdf_bytes(bytes): The raw bytes of the PDF to be converted.

    Returns: A list of pillow images, one per page of the PDF, rendered at
        PDF_CONVERSION_DPI.
    """
    return pdf2image.convert_from_bytes(pdf_bytes, dpi=PDF_CONVERSION_DPI)
def convert_pdf_to_image_ImageMagick(filename: Path, dest_folder: Path) -> Path:
    """Converts a PDF to JPEG page images using the ImageMagick command line tool.

    Args:
        filename(pathlib.Path): The path to the PDF to be converted.
        dest_folder(pathlib.Path): The destination folder for the converted pages.
                                   The output target is ``dest_folder/page.jpg``;
                                   for multi-page PDFs ImageMagick is expected to
                                   write numbered ``page-N.jpg`` files instead.

    Returns: dest_folder

    Raises:
        FileNotFoundError: If the ``magick`` executable cannot be found.
        subprocess.CalledProcessError: If ImageMagick exits with a non-zero status.
    """
    # Imported locally so this function does not depend on a module-level import.
    import subprocess

    # Pass the arguments as a list (no shell): every option stays a separate
    # token, and paths containing spaces or shell metacharacters are handled
    # correctly. The previous string-built command was missing the spaces
    # between its parts and therefore could never run successfully.
    command = [
        "magick",
        "convert",
        "-density",
        str(PDF_CONVERSION_DPI),
        str(filename),
        "-quality",
        "100",
        str(dest_folder / "page.jpg"),
    ]
    # check=True surfaces a failed conversion instead of silently returning a
    # folder without any pages in it.
    subprocess.run(command, check=True)
    return dest_folder
def preprocess_image(image: Image.Image) -> Image.Image:
    """Prepare a single image for OCR.

    Currently the only step performed is:
        1. Orientation correction

    Note that the image passed in is closed once the corrected copy has been
    produced.

    Args:
        image(PIL.Image.Image): The image to be preprocessed.

    Returns: The preprocessed pillow image.
    """
    upright = correct_orientation(image)
    # The corrected image is a separate copy, so the input can be released.
    image.close()
    return upright
def preprocess_pdf_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
    """Turn a PDF into a list of OCR-ready page images.

    Steps performed:
        1. PDF to image conversion (via pdf2image)
        2. Orientation correction of every page

    Args:
        pdf_bytes(bytes): The raw bytes of the PDF to be preprocessed.

    Returns: A list of pillow images, one per page of the PDF.
    """
    pages = convert_pdf_to_image_pdf2image(pdf_bytes)
    processed = []
    for page in pages:
        processed.append(preprocess_image(page))
        # Release the raw page once its preprocessed counterpart exists.
        page.close()
    return processed
def preprocess_pdf_ImageMagick(filename: Path) -> List[Image.Image]:
    """Preprocesses a PDF for future use with OCR, converting it with ImageMagick.

    The following operations are performed:
        1. PDF to image conversion (into a temporary folder)
        2. Orientation correction

    Args:
        filename(pathlib.Path): The path to the PDF to be preprocessed.

    Returns: A list of pillow images corresponding to each page from the PDF,
        in page order.
    """
    # Imported locally so this function does not depend on a module-level import.
    import tempfile

    # The converted pages are only an intermediate product, so they are written
    # to a temporary folder that is removed when the block exits.
    with tempfile.TemporaryDirectory() as temp_dir:
        dest_folder = convert_pdf_to_image_ImageMagick(filename, Path(temp_dir))
        # Sort by (name length, name) so numbered pages keep their numeric
        # order (e.g. page-2.jpg before page-10.jpg), which a plain
        # lexicographic sort would not guarantee.
        page_paths = sorted(
            dest_folder.glob("*.jpg"),
            key=lambda path: (len(path.name), path.name),
        )
        result = []
        for page_path in page_paths:
            # glob() yields paths, not images: open each page first.
            # preprocess_image closes the opened image and returns a separate
            # copy, so no file handle into the temporary folder is kept.
            result.append(preprocess_image(Image.open(page_path)))
    return result
if __name__ == '__main__':
    # Demo 1: preprocess a single image and display the result.
    source_image = Image.open('examples/upright.jpeg')
    upright_image = preprocess_image(source_image)
    source_image.close()
    upright_image.show()
    upright_image.close()

    # Demo 2: preprocess every page of a PDF and display each page.
    with open('examples/rotated.pdf', 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()
    for page in preprocess_pdf_pdf2image(pdf_bytes):
        page.show()
        page.close()