from concurrent.futures import ThreadPoolExecutor
from typing import Optional

from ..components.extractPdfDetails import ExtractPdfDetails
from ..components.summaryEngine import SummaryEngine
from ..utils.logger import logger


class Pipeline:
    """Turn a batch of PDF files into one combined summary.

    The work is delegated to two components: ``ExtractPdfDetails`` (image
    conversion, chunking and per-chunk extraction) and ``SummaryEngine``
    (merging the per-chunk results into the final summary).
    """

    # Upper bound on worker threads used for per-chunk extraction.
    MAX_WORKERS = 30

    def __init__(self):
        """Log the start-up message and instantiate both components."""
        logger.info("INITIALIZING PIPELINE")
        self.extractPdfDetails = ExtractPdfDetails()
        self.summaryEngine = SummaryEngine()

    def run(self, pdfBytesList: list[bytes]) -> Optional[str]:
        """Run the pipeline on multiple PDF files.

        Args:
            pdfBytesList: raw bytes of each PDF file to process.

        Returns:
            The value returned by ``SummaryEngine.summarize`` for the
            per-chunk results of all PDFs, or ``None`` if any step raised
            (the exception is logged, not propagated).
        """
        try:
            logger.info("Running the pipeline for multiple PDFs")

            # Collect the images of every PDF into one flat list, keeping
            # the order of the input files.
            allImages = []
            for pdfBytes in pdfBytesList:
                allImages.extend(
                    self.extractPdfDetails.convertToImages(pdfBytes=pdfBytes)
                )

            chunks = self.extractPdfDetails.chunkImages(images=allImages)

            # Fan the chunks out to a thread pool. ``map`` yields results
            # in input order and re-raises a worker's exception when its
            # result is retrieved, which lands in the handler below.
            with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
                summaries = list(
                    executor.map(
                        self.extractPdfDetails.extractDetailsFromChunk,
                        chunks,
                    )
                )

            return self.summaryEngine.summarize(texts=summaries)
        except Exception as e:
            # Top-level boundary: log with traceback and signal failure to
            # the caller with None instead of raising.
            logger.exception(f"Error running the pipeline: {e}")
            return None