Spaces:
Sleeping
Sleeping
| """Responsible for extracting text from images and PDFs using OCR engines or other modules. | |
| """ | |
| from io import BytesIO | |
| from typing import List | |
| import pyocr.tesseract | |
| import pypdf | |
| from PIL import Image | |
def extract_text_from_pdf_pypdf(bytes_stream: BytesIO) -> str:
    """Reads a PDF with pypdf and returns the text of all its pages.

    Args:
        bytes_stream (BytesIO): The PDF content to read.

    Returns: The text extracted from every page, in page order, with two
        newline characters appended after each page's text.
    """
    reader = pypdf.PdfReader(bytes_stream)
    # One chunk per page; each chunk carries its own trailing separator so
    # the joined result also ends with the separator, as before.
    page_chunks = [page.extract_text() + "\n\n" for page in reader.pages]
    return "".join(page_chunks)
def extract_text_from_image_pyocr_tesseract(
    image: Image.Image, lang: str = "eng"
) -> str:
    """Extracts text from the given image using tesseract via pyocr.

    Args:
        image (PIL.Image.Image): The image to extract text from.
        lang (str): Language code handed to tesseract. Defaults to "eng",
            the value that used to be hard-coded, so existing callers are
            unaffected.

    Returns: The extracted text.

    Raises:
        RuntimeError: If pyocr reports that tesseract is not available.
    """
    # Check availability up front so the caller gets an explicit error.
    # RuntimeError is a subclass of Exception, so callers that catch
    # Exception keep working.
    if not pyocr.tesseract.is_available():
        raise RuntimeError("Tesseract is not available.")

    return pyocr.tesseract.image_to_string(image, lang=lang)
def extract_text_from_images_pyocr_tesseract(images: List[Image.Image]) -> str:
    """Extracts text from the given images using tesseract via pyocr.

    Each image is closed by this function as soon as it has been processed,
    so callers must not use the images afterwards.

    Args:
        images (List[PIL.Image.Image]): The images to extract text from.

    Returns: The extracted text of all images, in input order, with two
        newline characters appended after each image's text.
    """
    parts = []
    for image in images:
        try:
            parts.append(extract_text_from_image_pyocr_tesseract(image))
        finally:
            # Close the image even when OCR raises, so a failure does not
            # leave the image that was being processed open.
            image.close()
        parts.append("\n\n")
    return "".join(parts)
if __name__ == '__main__':
    # Manual smoke test: run both extraction paths on the bundled examples
    # and print what they return.
    filename = 'examples/upright.pdf'
    with open(filename, 'rb') as file:
        bytes_stream = BytesIO(file.read())
    text = extract_text_from_pdf_pypdf(bytes_stream)
    print(text)

    print("-"*25)

    filename = 'examples/upright.jpeg'
    # Use the image as a context manager so it is closed even if OCR raises.
    with Image.open(filename) as image:
        text = extract_text_from_image_pyocr_tesseract(image)
    print(text)