# Spaces: techconsptr / ConversAI (Sleeping)
# File size: 1,635 Bytes
# 7e24b41

from src.utils.functions import cleanText, getConfig
from concurrent.futures import ThreadPoolExecutor
from src.utils.exceptions import CustomException
from pdf2image import convert_from_path
from src.utils.logging import logger
import numpy as np
import pymupdf
import easyocr


class PdfLoader:
    """Extract text from PDFs, either from the text layer or through OCR.

    ``searchablePdf`` reads the text layer with PyMuPDF; ``scannablePdf``
    rasterizes the pages with pdf2image and runs EasyOCR on each image.
    Both methods are best-effort: on failure they log the error and
    return ``None`` instead of raising.
    """

    def __init__(self) -> None:
        """Load ``config.ini`` and build the English EasyOCR reader.

        GPU usage is taken from the boolean ``gpu`` option of the
        ``EASYOCR`` section of the configuration.
        """
        self.config = getConfig(path="config.ini")
        useGpu = self.config.getboolean("EASYOCR", "gpu")
        self.reader = easyocr.Reader(["en"], gpu=useGpu)

    def extractTextFromPage(self, page):
        """Return the text of a single PyMuPDF page, passed through ``cleanText``."""
        return cleanText(text=page.get_text())

    def searchablePdf(self, pdfPath: str):
        """Extract the text layer of every page of a searchable PDF.

        Args:
            pdfPath: Path of the PDF file to read.

        Returns:
            The cleaned page texts joined with newlines (an empty string
            for a document without pages), or ``None`` if extraction
            failed; the failure is logged, not raised.
        """
        try:
            logger.info("Text Extraction Started from Searchable PDF")
            # The context manager closes the document even when a page
            # fails to extract, so the file handle is never leaked.
            with pymupdf.open(pdfPath) as doc:
                # Pages are processed sequentially on purpose: PyMuPDF
                # does not support using one document from several
                # threads, and doing so may corrupt results or crash.
                texts = [self.extractTextFromPage(page) for page in doc]
            return "\n".join(texts)
        except Exception as e:
            logger.error(CustomException(e))
            return None

    def getText(self, image):
        """Run OCR on one page image and return its cleaned text.

        Args:
            image: Page image; it is converted with ``np.array`` before
                being handed to the EasyOCR reader.

        Returns:
            The recognized paragraphs joined with newlines and passed
            through ``cleanText``.
        """
        detections = self.reader.readtext(np.array(image), paragraph=True)
        # Index 1 of each detection holds the recognized string.
        paragraphs = [detection[1] for detection in detections]
        return cleanText(text="\n".join(paragraphs))

    def scannablePdf(self, pdfPath: str):
        """Extract text from a scanned (image-only) PDF through OCR.

        Args:
            pdfPath: Path of the PDF file to read.

        Returns:
            The OCR text of all pages joined with newlines, or ``None``
            if conversion or recognition failed; the failure is logged,
            not raised.
        """
        try:
            logger.info("Text Extraction Started from Scannable PDF")
            pageImages = convert_from_path(pdfPath)
            texts = [self.getText(pageImage) for pageImage in pageImages]
            return "\n".join(texts)
        except Exception as e:
            logger.error(CustomException(e))
            return None