Spaces:

Rauhan
/

PdfSummarizer

Sleeping

App Files Community

Rauhan committed on May 3, 2025

Commit

50cbe29

1 Parent(s): ad329be

UPDATE: multiple pdfs

Browse files

Files changed (3) hide show

main.py +26 -26
src/components/extractPdfDetails.py +1 -1
src/pipelines/pipeline.py +12 -9

main.py CHANGED Viewed

@@ -81,8 +81,8 @@ def getDejaVuFontPath():
 # Cache summary generation
 @st.cache_data(show_spinner=False, ttl=3600)
-def generateSummary(_pipeline, pdfBytes):
-    return pipeline.run(pdfBytes)
 # Cache PDF generation
 @st.cache_data(show_spinner=False, ttl=3600)
@@ -116,18 +116,19 @@ def generatePdfBytes(summary, fontPath):
 # Sidebar
 with st.sidebar:
-    st.markdown("## 📄 Upload PDF")
-    uploadedFile = st.file_uploader("Drop your PDF here", type=["pdf"])
-    if uploadedFile:
-        st.markdown("### 🔍 File Info")
-        pdfDetails = {
-            "📁 File Name": uploadedFile.name,
-            "📦 Size": f"{round(len(uploadedFile.getvalue()) / 1024, 2)} KB",
-            "⏰ Uploaded": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        }
-        for key, value in pdfDetails.items():
-            st.write(f"**{key}**: {value}")
     st.markdown("---")
     st.markdown("""
@@ -138,12 +139,11 @@ with st.sidebar:
     </div>
     """, unsafe_allow_html=True)
 # Main content
 st.markdown("<h1 class='main-header'>Welcome to <span style='color:#2E7D32'>AlphaExtract</span></h1>", unsafe_allow_html=True)
-st.write("Upload a PDF to instantly receive a professional-grade analytical summary.")
-if uploadedFile:
     statusContainer = st.empty()
     summaryContainer = st.empty()
@@ -153,17 +153,17 @@ if uploadedFile:
         try:
             startTime = time.time()
-            statusBox.info("📘 Reading PDF file...")
-            pdfBytes = uploadedFile.getvalue()
             readDuration = time.time() - startTime
-            statusBox.success(f"✅ PDF file read successfully ({readDuration:.2f}s)")
-            statusBox.info("🧠 Generating summary...")
-            summary = generateSummary(pipeline, pdfBytes)
             totalTime = time.time() - startTime
             if summary:
-                statusBox.success(f"✅ Summary generated successfully (Total time: {totalTime:.2f}s)")
                 with summaryContainer.container():
                     st.markdown("<h2 class='summary-header'>📊 Generated Summary</h2>", unsafe_allow_html=True)
@@ -177,7 +177,7 @@ if uploadedFile:
                         st.download_button(
                             label="⬇️ Download Summary as PDF",
                             data=pdfBytesOut,
-                            file_name=f"summary_{timestamp}.pdf",
                             mime="application/pdf"
                         )
                     except Exception as e:
@@ -185,6 +185,6 @@ if uploadedFile:
             else:
                 statusBox.error("❌ Failed to generate summary. Please try again.")
         except Exception as e:
-            statusBox.error(f"❌ Error processing PDF: {str(e)}")
 else:
-    st.info("🚀 Please upload a PDF file using the sidebar to get started.")

 # Cache summary generation
 @st.cache_data(show_spinner=False, ttl=3600)
+def generateSummary(_pipeline, pdfBytesList):
+    return pipeline.run(pdfBytesList)
 # Cache PDF generation
 @st.cache_data(show_spinner=False, ttl=3600)
 # Sidebar
 with st.sidebar:
+    st.markdown("## 📄 Upload PDFs")
+    uploadedFiles = st.file_uploader("Drop your PDFs here", type=["pdf"], accept_multiple_files=True)
+    if uploadedFiles:
+        st.markdown("### 🔍 Files Info")
+        total_size = 0
+        for file in uploadedFiles:
+            file_size = len(file.getvalue()) / 1024
+            total_size += file_size
+            st.write(f"**📁 {file.name}**: {file_size:.2f} KB")
+        st.write(f"**📦 Total Size**: {total_size:.2f} KB")
+        st.write(f"**⏰ Uploaded**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
     st.markdown("---")
     st.markdown("""
     </div>
     """, unsafe_allow_html=True)
 # Main content
 st.markdown("<h1 class='main-header'>Welcome to <span style='color:#2E7D32'>AlphaExtract</span></h1>", unsafe_allow_html=True)
+st.write("Upload your PDFs to instantly receive a professional-grade analytical summary.")
+if uploadedFiles:
     statusContainer = st.empty()
     summaryContainer = st.empty()
         try:
             startTime = time.time()
+            statusBox.info("📘 Reading PDF files...")
+            pdfBytesList = [file.getvalue() for file in uploadedFiles]
             readDuration = time.time() - startTime
+            statusBox.success(f"✅ {len(uploadedFiles)} PDF files read successfully ({readDuration:.2f}s)")
+            statusBox.info("🧠 Generating combined summary...")
+            summary = generateSummary(pipeline, pdfBytesList)
             totalTime = time.time() - startTime
             if summary:
+                statusBox.success(f"✅ Combined summary generated successfully (Total time: {totalTime:.2f}s)")
                 with summaryContainer.container():
                     st.markdown("<h2 class='summary-header'>📊 Generated Summary</h2>", unsafe_allow_html=True)
                         st.download_button(
                             label="⬇️ Download Summary as PDF",
                             data=pdfBytesOut,
+                            file_name=f"combined_summary_{timestamp}.pdf",
                             mime="application/pdf"
                         )
                     except Exception as e:
             else:
                 statusBox.error("❌ Failed to generate summary. Please try again.")
         except Exception as e:
+            statusBox.error(f"❌ Error processing PDFs: {str(e)}")
 else:
+    st.info("🚀 Please upload your PDF files using the sidebar to get started.")

src/components/extractPdfDetails.py CHANGED Viewed

@@ -15,7 +15,7 @@ class ExtractPdfDetails:
         self.config = getConfig(os.path.join(os.getcwd(), "config.ini"))
         self.prompts = getYaml(os.path.join(os.getcwd(), "prompts.yaml"))
         self.llmClient = OpenAI(
-            base_url = self.config["GROQ CONFIG"]["BASEURL"],
             api_key = os.environ["GROQ_API_KEY"]
         )

         self.config = getConfig(os.path.join(os.getcwd(), "config.ini"))
         self.prompts = getYaml(os.path.join(os.getcwd(), "prompts.yaml"))
         self.llmClient = OpenAI(
+            base_url = self.config.get("GROQ CONFIG", "BASEURL"),
             api_key = os.environ["GROQ_API_KEY"]
         )

src/pipelines/pipeline.py CHANGED Viewed

@@ -9,22 +9,25 @@ class Pipeline:
         self.extractPdfDetails = ExtractPdfDetails()
         self.summaryEngine = SummaryEngine()
-    def run(self, pdfBytes: bytes) -> str:
         """
-        Run the pipeline
         Args:
-            pdfBytes: bytes of the pdf file
         Returns:
-            summary: summary of the pdf file
         """
         try:
-            logger.info("Running the pipeline")
-            images = self.extractPdfDetails.convertToImages(pdfBytes = pdfBytes)
-            chunks = self.extractPdfDetails.chunkImages(images = images)
-            with ThreadPoolExecutor(max_workers = 30) as executor:
                 futures = [executor.submit(self.extractPdfDetails.extractDetailsFromChunk, chunk) for chunk in chunks]
                 summaries = [future.result() for future in futures]
-            summary = self.summaryEngine.summarize(texts = summaries)
             return summary
         except Exception as e:
             logger.exception(f"Error running the pipeline: {e}")

         self.extractPdfDetails = ExtractPdfDetails()
         self.summaryEngine = SummaryEngine()
+    def run(self, pdfBytesList: list[bytes]) -> str:
         """
+        Run the pipeline on multiple PDF files
         Args:
+            pdfBytesList: list of bytes of multiple pdf files
         Returns:
+            summary: combined summary of all pdf files
         """
         try:
+            logger.info("Running the pipeline for multiple PDFs")
+            allImages = []
+            for pdfBytes in pdfBytesList:
+                images = self.extractPdfDetails.convertToImages(pdfBytes=pdfBytes)
+                allImages.extend(images)
+            chunks = self.extractPdfDetails.chunkImages(images=allImages)
+            with ThreadPoolExecutor(max_workers=30) as executor:
                 futures = [executor.submit(self.extractPdfDetails.extractDetailsFromChunk, chunk) for chunk in chunks]
                 summaries = [future.result() for future in futures]
+            summary = self.summaryEngine.summarize(texts=summaries)
             return summary
         except Exception as e:
             logger.exception(f"Error running the pipeline: {e}")