Spaces:
Sleeping
Sleeping
| from src.utils.functions import cleanText, getConfig | |
| from concurrent.futures import ThreadPoolExecutor | |
| from src.utils.exceptions import CustomException | |
| from pdf2image import convert_from_path | |
| from src.utils.logging import logger | |
| import numpy as np | |
| import pymupdf | |
| import easyocr | |
class PdfLoader:
    """Extract cleaned text from PDF files.

    Two strategies are offered: ``searchablePdf`` reads each page's text
    with PyMuPDF, while ``scannablePdf`` converts the pages to images with
    pdf2image and runs EasyOCR on every image. Both pass the extracted text
    through ``cleanText`` and join the pages with newlines.
    """

    def __init__(self) -> None:
        """Load ``config.ini`` and build the English EasyOCR reader."""
        self.config = getConfig(path="config.ini")
        # GPU usage is driven by the ``gpu`` option of the [EASYOCR] section.
        useGpu = self.config.getboolean("EASYOCR", "gpu")
        self.reader = easyocr.Reader(["en"], gpu=useGpu)

    def extractTextFromPage(self, page):
        """Return the cleaned text of a single page.

        Args:
            page: Object exposing ``get_text()``; ``searchablePdf`` passes
                the pages of a PyMuPDF document.

        Returns:
            Whatever ``cleanText`` produces for the page's text.
        """
        return cleanText(text=page.get_text())

    def searchablePdf(self, pdfPath: str):
        """Extract the text of every page of the PDF at ``pdfPath``.

        Args:
            pdfPath: Path of the PDF file to open with PyMuPDF.

        Returns:
            The cleaned page texts joined with ``"\\n"`` in page order, or
            ``None`` when extraction fails (the error is logged, not raised).
        """
        try:
            logger.info("Text Extraction Started from Searchable PDF")
            # The context manager closes the document even if a page fails,
            # so the file handle is never leaked on the error path.
            with pymupdf.open(pdfPath) as doc:
                # Pages are read one after another on the calling thread:
                # PyMuPDF states that it does not support multithreaded use,
                # so sharing one document across a thread pool is unsafe.
                texts = [self.extractTextFromPage(page) for page in doc]
            return "\n".join(texts)
        except Exception as e:
            # Best-effort contract: log the failure and signal it with None.
            logger.error(CustomException(e))
            return None

    def getText(self, image):
        """Run OCR on one page image and return its cleaned text.

        Args:
            image: Page image; it is converted with ``np.array`` before
                being handed to the EasyOCR reader.

        Returns:
            Whatever ``cleanText`` produces for the newline-joined OCR text.
        """
        results = self.reader.readtext(np.array(image), paragraph=True)
        # Index 1 of each result entry holds the recognised text.
        text = "\n".join(entry[1] for entry in results)
        return cleanText(text=text)

    def scannablePdf(self, pdfPath: str):
        """Extract text from the PDF at ``pdfPath`` by running OCR on its pages.

        Args:
            pdfPath: Path of the PDF file to convert to images.

        Returns:
            The cleaned per-page OCR texts joined with ``"\\n"``, or ``None``
            when extraction fails (the error is logged, not raised).
        """
        try:
            logger.info("Text Extraction Started from Scannable PDF")
            allImages = convert_from_path(pdfPath)
            texts = [self.getText(image) for image in allImages]
            return "\n".join(texts)
        except Exception as e:
            # Best-effort contract: log the failure and signal it with None.
            logger.error(CustomException(e))
            return None