Rauhan's picture
UPDATE: multiple pdfs
50cbe29
from ..components.extractPdfDetails import ExtractPdfDetails
from ..components.summaryEngine import SummaryEngine
from concurrent.futures import ThreadPoolExecutor
from ..utils.logger import logger
class Pipeline:
    """Summarize one or more PDFs using a thread pool for per-chunk extraction.

    ``run`` converts every PDF to images, pools the images, splits the pool
    into chunks, extracts details from each chunk concurrently and passes the
    per-chunk results to the summary engine.
    """

    # Pool size used when the caller does not pick one (the value that was
    # previously hard-coded inside ``run``).
    DEFAULT_MAX_WORKERS = 30

    def __init__(self, *, maxWorkers: int = DEFAULT_MAX_WORKERS):
        """
        Create the PDF extractor and the summary engine used by ``run``.
        Args:
            maxWorkers: upper bound on the number of threads used for chunk extraction
        Raises:
            ValueError: if maxWorkers is smaller than 1
        """
        # Fail fast here: ThreadPoolExecutor would reject the value later, but
        # inside run()'s broad except, turning every call into a silent None.
        if maxWorkers < 1:
            raise ValueError(f"maxWorkers must be at least 1, got {maxWorkers}")
        logger.info("INITIALIZING PIPELINE")
        self.maxWorkers = maxWorkers
        self.extractPdfDetails = ExtractPdfDetails()
        self.summaryEngine = SummaryEngine()

    def run(self, pdfBytesList: list[bytes]) -> "str | None":
        """
        Run the pipeline on multiple PDF files
        Args:
            pdfBytesList: list of bytes of multiple pdf files
        Returns:
            summary: result of the summary engine for all per-chunk extractions,
                or None when any step raised (the error is logged, not re-raised)
        """
        try:
            logger.info("Running the pipeline for multiple PDFs")
            # Images of all PDFs are pooled first; chunking then operates on
            # the pooled list rather than on each PDF separately.
            allImages = []
            for pdfBytes in pdfBytesList:
                allImages.extend(self.extractPdfDetails.convertToImages(pdfBytes=pdfBytes))
            chunks = self.extractPdfDetails.chunkImages(images=allImages)
            # executor.map yields results in submission order and re-raises a
            # worker's exception when its result is consumed, exactly like
            # collecting future.result() over an ordered list of futures.
            with ThreadPoolExecutor(max_workers=self.maxWorkers) as executor:
                summaries = list(
                    executor.map(self.extractPdfDetails.extractDetailsFromChunk, chunks)
                )
            return self.summaryEngine.summarize(texts=summaries)
        except Exception as e:
            # Best-effort boundary kept from the original: log with traceback
            # and signal failure to the caller via None.
            logger.exception(f"Error running the pipeline: {e}")
            return None