Rauhan committed on
Commit
50cbe29
·
1 Parent(s): ad329be

UPDATE: multiple pdfs

Browse files
main.py CHANGED
@@ -81,8 +81,8 @@ def getDejaVuFontPath():
81
 
82
  # Cache summary generation
83
  @st.cache_data(show_spinner=False, ttl=3600)
84
- def generateSummary(_pipeline, pdfBytes):
85
- return pipeline.run(pdfBytes)
86
 
87
  # Cache PDF generation
88
  @st.cache_data(show_spinner=False, ttl=3600)
@@ -116,18 +116,19 @@ def generatePdfBytes(summary, fontPath):
116
 
117
  # Sidebar
118
  with st.sidebar:
119
- st.markdown("## 📄 Upload PDF")
120
- uploadedFile = st.file_uploader("Drop your PDF here", type=["pdf"])
121
-
122
- if uploadedFile:
123
- st.markdown("### 🔍 File Info")
124
- pdfDetails = {
125
- "📁 File Name": uploadedFile.name,
126
- "📦 Size": f"{round(len(uploadedFile.getvalue()) / 1024, 2)} KB",
127
- "⏰ Uploaded": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
128
- }
129
- for key, value in pdfDetails.items():
130
- st.write(f"**{key}**: {value}")
 
131
 
132
  st.markdown("---")
133
  st.markdown("""
@@ -138,12 +139,11 @@ with st.sidebar:
138
  </div>
139
  """, unsafe_allow_html=True)
140
 
141
-
142
  # Main content
143
  st.markdown("<h1 class='main-header'>Welcome to <span style='color:#2E7D32'>AlphaExtract</span></h1>", unsafe_allow_html=True)
144
- st.write("Upload a PDF to instantly receive a professional-grade analytical summary.")
145
 
146
- if uploadedFile:
147
  statusContainer = st.empty()
148
  summaryContainer = st.empty()
149
 
@@ -153,17 +153,17 @@ if uploadedFile:
153
 
154
  try:
155
  startTime = time.time()
156
- statusBox.info("📘 Reading PDF file...")
157
- pdfBytes = uploadedFile.getvalue()
158
  readDuration = time.time() - startTime
159
- statusBox.success(f"✅ PDF file read successfully ({readDuration:.2f}s)")
160
 
161
- statusBox.info("🧠 Generating summary...")
162
- summary = generateSummary(pipeline, pdfBytes)
163
  totalTime = time.time() - startTime
164
 
165
  if summary:
166
- statusBox.success(f"✅ Summary generated successfully (Total time: {totalTime:.2f}s)")
167
 
168
  with summaryContainer.container():
169
  st.markdown("<h2 class='summary-header'>📊 Generated Summary</h2>", unsafe_allow_html=True)
@@ -177,7 +177,7 @@ if uploadedFile:
177
  st.download_button(
178
  label="⬇️ Download Summary as PDF",
179
  data=pdfBytesOut,
180
- file_name=f"summary_{timestamp}.pdf",
181
  mime="application/pdf"
182
  )
183
  except Exception as e:
@@ -185,6 +185,6 @@ if uploadedFile:
185
  else:
186
  statusBox.error("❌ Failed to generate summary. Please try again.")
187
  except Exception as e:
188
- statusBox.error(f"❌ Error processing PDF: {str(e)}")
189
  else:
190
- st.info("🚀 Please upload a PDF file using the sidebar to get started.")
 
81
 
82
  # Cache summary generation
83
  @st.cache_data(show_spinner=False, ttl=3600)
84
+ def generateSummary(_pipeline, pdfBytesList):
85
+ return pipeline.run(pdfBytesList)
86
 
87
  # Cache PDF generation
88
  @st.cache_data(show_spinner=False, ttl=3600)
 
116
 
117
  # Sidebar
118
  with st.sidebar:
119
+ st.markdown("## 📄 Upload PDFs")
120
+ uploadedFiles = st.file_uploader("Drop your PDFs here", type=["pdf"], accept_multiple_files=True)
121
+
122
+ if uploadedFiles:
123
+ st.markdown("### 🔍 Files Info")
124
+ total_size = 0
125
+ for file in uploadedFiles:
126
+ file_size = len(file.getvalue()) / 1024
127
+ total_size += file_size
128
+ st.write(f"**📁 {file.name}**: {file_size:.2f} KB")
129
+
130
+ st.write(f"**📦 Total Size**: {total_size:.2f} KB")
131
+ st.write(f"**⏰ Uploaded**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
132
 
133
  st.markdown("---")
134
  st.markdown("""
 
139
  </div>
140
  """, unsafe_allow_html=True)
141
 
 
142
  # Main content
143
  st.markdown("<h1 class='main-header'>Welcome to <span style='color:#2E7D32'>AlphaExtract</span></h1>", unsafe_allow_html=True)
144
+ st.write("Upload your PDFs to instantly receive a professional-grade analytical summary.")
145
 
146
+ if uploadedFiles:
147
  statusContainer = st.empty()
148
  summaryContainer = st.empty()
149
 
 
153
 
154
  try:
155
  startTime = time.time()
156
+ statusBox.info("📘 Reading PDF files...")
157
+ pdfBytesList = [file.getvalue() for file in uploadedFiles]
158
  readDuration = time.time() - startTime
159
+ statusBox.success(f"✅ {len(uploadedFiles)} PDF files read successfully ({readDuration:.2f}s)")
160
 
161
+ statusBox.info("🧠 Generating combined summary...")
162
+ summary = generateSummary(pipeline, pdfBytesList)
163
  totalTime = time.time() - startTime
164
 
165
  if summary:
166
+ statusBox.success(f"✅ Combined summary generated successfully (Total time: {totalTime:.2f}s)")
167
 
168
  with summaryContainer.container():
169
  st.markdown("<h2 class='summary-header'>📊 Generated Summary</h2>", unsafe_allow_html=True)
 
177
  st.download_button(
178
  label="⬇️ Download Summary as PDF",
179
  data=pdfBytesOut,
180
+ file_name=f"combined_summary_{timestamp}.pdf",
181
  mime="application/pdf"
182
  )
183
  except Exception as e:
 
185
  else:
186
  statusBox.error("❌ Failed to generate summary. Please try again.")
187
  except Exception as e:
188
+ statusBox.error(f"❌ Error processing PDFs: {str(e)}")
189
  else:
190
+ st.info("🚀 Please upload your PDF files using the sidebar to get started.")
src/components/extractPdfDetails.py CHANGED
@@ -15,7 +15,7 @@ class ExtractPdfDetails:
15
  self.config = getConfig(os.path.join(os.getcwd(), "config.ini"))
16
  self.prompts = getYaml(os.path.join(os.getcwd(), "prompts.yaml"))
17
  self.llmClient = OpenAI(
18
- base_url = self.config["GROQ CONFIG"]["BASEURL"],
19
  api_key = os.environ["GROQ_API_KEY"]
20
  )
21
 
 
15
  self.config = getConfig(os.path.join(os.getcwd(), "config.ini"))
16
  self.prompts = getYaml(os.path.join(os.getcwd(), "prompts.yaml"))
17
  self.llmClient = OpenAI(
18
+ base_url = self.config.get("GROQ CONFIG", "BASEURL"),
19
  api_key = os.environ["GROQ_API_KEY"]
20
  )
21
 
src/pipelines/pipeline.py CHANGED
@@ -9,22 +9,25 @@ class Pipeline:
9
  self.extractPdfDetails = ExtractPdfDetails()
10
  self.summaryEngine = SummaryEngine()
11
 
12
- def run(self, pdfBytes: bytes) -> str:
13
  """
14
- Run the pipeline
15
  Args:
16
- pdfBytes: bytes of the pdf file
17
  Returns:
18
- summary: summary of the pdf file
19
  """
20
  try:
21
- logger.info("Running the pipeline")
22
- images = self.extractPdfDetails.convertToImages(pdfBytes = pdfBytes)
23
- chunks = self.extractPdfDetails.chunkImages(images = images)
24
- with ThreadPoolExecutor(max_workers = 30) as executor:
 
 
 
25
  futures = [executor.submit(self.extractPdfDetails.extractDetailsFromChunk, chunk) for chunk in chunks]
26
  summaries = [future.result() for future in futures]
27
- summary = self.summaryEngine.summarize(texts = summaries)
28
  return summary
29
  except Exception as e:
30
  logger.exception(f"Error running the pipeline: {e}")
 
9
  self.extractPdfDetails = ExtractPdfDetails()
10
  self.summaryEngine = SummaryEngine()
11
 
12
+ def run(self, pdfBytesList: list[bytes]) -> str:
13
  """
14
+ Run the pipeline on multiple PDF files
15
  Args:
16
+ pdfBytesList: list of bytes of multiple pdf files
17
  Returns:
18
+ summary: combined summary of all pdf files
19
  """
20
  try:
21
+ logger.info("Running the pipeline for multiple PDFs")
22
+ allImages = []
23
+ for pdfBytes in pdfBytesList:
24
+ images = self.extractPdfDetails.convertToImages(pdfBytes=pdfBytes)
25
+ allImages.extend(images)
26
+ chunks = self.extractPdfDetails.chunkImages(images=allImages)
27
+ with ThreadPoolExecutor(max_workers=30) as executor:
28
  futures = [executor.submit(self.extractPdfDetails.extractDetailsFromChunk, chunk) for chunk in chunks]
29
  summaries = [future.result() for future in futures]
30
+ summary = self.summaryEngine.summarize(texts=summaries)
31
  return summary
32
  except Exception as e:
33
  logger.exception(f"Error running the pipeline: {e}")