Spaces:
Sleeping
Sleeping
from concurrent.futures import ThreadPoolExecutor
from typing import Optional

from ..components.extractPdfDetails import ExtractPdfDetails
from ..components.summaryEngine import SummaryEngine
from ..utils.logger import logger
class Pipeline:
    """Summarize a batch of PDF files end to end.

    The pipeline converts every PDF to images, groups the images into
    chunks, extracts details from each chunk on a thread pool, and hands
    the per-chunk results to the summary engine.
    """

    def __init__(self, maxWorkers: int = 30):
        """
        Build the pipeline components.
        Args:
            maxWorkers: upper bound on threads used to process chunks
                concurrently (defaults to 30, the previous fixed value)
        Raises:
            ValueError: if maxWorkers is smaller than 1
        """
        # Fail fast here: inside run() an invalid pool size would be
        # swallowed by the catch-all handler and surface only as None.
        if maxWorkers < 1:
            raise ValueError("maxWorkers must be a positive integer")
        logger.info("INITIALIZING PIPELINE")
        self.maxWorkers = maxWorkers
        self.extractPdfDetails = ExtractPdfDetails()
        self.summaryEngine = SummaryEngine()

    def run(self, pdfBytesList: list[bytes]) -> Optional[str]:
        """
        Run the pipeline on multiple PDF files
        Args:
            pdfBytesList: list of bytes of multiple pdf files
        Returns:
            summary: combined summary of all pdf files, or None when any
                step raised (the error is logged with its traceback)
        """
        try:
            logger.info("Running the pipeline for multiple PDFs")
            # Pool the images of every PDF so chunking spans the whole batch.
            allImages = []
            for pdfBytes in pdfBytesList:
                allImages.extend(
                    self.extractPdfDetails.convertToImages(pdfBytes=pdfBytes)
                )
            chunks = self.extractPdfDetails.chunkImages(images=allImages)
            # executor.map yields results in the same order as `chunks`,
            # so the summaries stay aligned with the document order.
            with ThreadPoolExecutor(max_workers=self.maxWorkers) as executor:
                summaries = list(
                    executor.map(
                        self.extractPdfDetails.extractDetailsFromChunk, chunks
                    )
                )
            return self.summaryEngine.summarize(texts=summaries)
        except Exception as e:
            # Deliberate best-effort boundary: log and signal failure via None.
            logger.exception(f"Error running the pipeline: {e}")
            return None