Spaces:
Sleeping
Sleeping
UPDATE: multiple pdfs
Browse files
- main.py +26 -26
- src/components/extractPdfDetails.py +1 -1
- src/pipelines/pipeline.py +12 -9
main.py
CHANGED
|
@@ -81,8 +81,8 @@ def getDejaVuFontPath():
|
|
| 81 |
|
| 82 |
# Cache summary generation
|
| 83 |
@st.cache_data(show_spinner=False, ttl=3600)
|
| 84 |
-
def generateSummary(_pipeline,
|
| 85 |
-
return pipeline.run(
|
| 86 |
|
| 87 |
# Cache PDF generation
|
| 88 |
@st.cache_data(show_spinner=False, ttl=3600)
|
|
@@ -116,18 +116,19 @@ def generatePdfBytes(summary, fontPath):
|
|
| 116 |
|
| 117 |
# Sidebar
|
| 118 |
with st.sidebar:
|
| 119 |
-
st.markdown("## π Upload
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
if
|
| 123 |
-
st.markdown("### π
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
|
|
|
| 131 |
|
| 132 |
st.markdown("---")
|
| 133 |
st.markdown("""
|
|
@@ -138,12 +139,11 @@ with st.sidebar:
|
|
| 138 |
</div>
|
| 139 |
""", unsafe_allow_html=True)
|
| 140 |
|
| 141 |
-
|
| 142 |
# Main content
|
| 143 |
st.markdown("<h1 class='main-header'>Welcome to <span style='color:#2E7D32'>AlphaExtract</span></h1>", unsafe_allow_html=True)
|
| 144 |
-
st.write("Upload
|
| 145 |
|
| 146 |
-
if
|
| 147 |
statusContainer = st.empty()
|
| 148 |
summaryContainer = st.empty()
|
| 149 |
|
|
@@ -153,17 +153,17 @@ if uploadedFile:
|
|
| 153 |
|
| 154 |
try:
|
| 155 |
startTime = time.time()
|
| 156 |
-
statusBox.info("π Reading PDF
|
| 157 |
-
|
| 158 |
readDuration = time.time() - startTime
|
| 159 |
-
statusBox.success(f"✅ PDF
|
| 160 |
|
| 161 |
-
statusBox.info("🧠 Generating summary...")
|
| 162 |
-
summary = generateSummary(pipeline,
|
| 163 |
totalTime = time.time() - startTime
|
| 164 |
|
| 165 |
if summary:
|
| 166 |
-
statusBox.success(f"✅
|
| 167 |
|
| 168 |
with summaryContainer.container():
|
| 169 |
st.markdown("<h2 class='summary-header'>π Generated Summary</h2>", unsafe_allow_html=True)
|
|
@@ -177,7 +177,7 @@ if uploadedFile:
|
|
| 177 |
st.download_button(
|
| 178 |
label="⬇️ Download Summary as PDF",
|
| 179 |
data=pdfBytesOut,
|
| 180 |
-
file_name=f"
|
| 181 |
mime="application/pdf"
|
| 182 |
)
|
| 183 |
except Exception as e:
|
|
@@ -185,6 +185,6 @@ if uploadedFile:
|
|
| 185 |
else:
|
| 186 |
statusBox.error("❌ Failed to generate summary. Please try again.")
|
| 187 |
except Exception as e:
|
| 188 |
-
statusBox.error(f"❌ Error processing
|
| 189 |
else:
|
| 190 |
-
st.info("π Please upload
|
|
|
|
| 81 |
|
| 82 |
# Cache summary generation
|
| 83 |
@st.cache_data(show_spinner=False, ttl=3600)
|
| 84 |
+
def generateSummary(_pipeline, pdfBytesList):
    """Run the summarization pipeline over the uploaded PDFs.

    Args:
        _pipeline: Object exposing ``run(pdfBytesList)``. The leading
            underscore keeps ``st.cache_data`` from hashing it, so the
            cache key is derived from ``pdfBytesList`` only.
        pdfBytesList: Raw bytes of each uploaded PDF, one entry per file.

    Returns:
        Whatever ``_pipeline.run`` returns for the given PDFs.
    """
    # Use the injected pipeline rather than a module-level global, so the
    # function honours whichever pipeline instance the caller supplies.
    return _pipeline.run(pdfBytesList)
|
| 86 |
|
| 87 |
# Cache PDF generation
|
| 88 |
@st.cache_data(show_spinner=False, ttl=3600)
|
|
|
|
| 116 |
|
| 117 |
# Sidebar
|
| 118 |
with st.sidebar:
|
| 119 |
+
st.markdown("## π Upload PDFs")
|
| 120 |
+
uploadedFiles = st.file_uploader("Drop your PDFs here", type=["pdf"], accept_multiple_files=True)
|
| 121 |
+
|
| 122 |
+
if uploadedFiles:
|
| 123 |
+
st.markdown("### π Files Info")
|
| 124 |
+
total_size = 0
|
| 125 |
+
for file in uploadedFiles:
|
| 126 |
+
file_size = len(file.getvalue()) / 1024
|
| 127 |
+
total_size += file_size
|
| 128 |
+
st.write(f"**π {file.name}**: {file_size:.2f} KB")
|
| 129 |
+
|
| 130 |
+
st.write(f"**📦 Total Size**: {total_size:.2f} KB")
|
| 131 |
+
st.write(f"**⏰ Uploaded**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
| 132 |
|
| 133 |
st.markdown("---")
|
| 134 |
st.markdown("""
|
|
|
|
| 139 |
</div>
|
| 140 |
""", unsafe_allow_html=True)
|
| 141 |
|
|
|
|
| 142 |
# Main content
|
| 143 |
st.markdown("<h1 class='main-header'>Welcome to <span style='color:#2E7D32'>AlphaExtract</span></h1>", unsafe_allow_html=True)
|
| 144 |
+
st.write("Upload your PDFs to instantly receive a professional-grade analytical summary.")
|
| 145 |
|
| 146 |
+
if uploadedFiles:
|
| 147 |
statusContainer = st.empty()
|
| 148 |
summaryContainer = st.empty()
|
| 149 |
|
|
|
|
| 153 |
|
| 154 |
try:
|
| 155 |
startTime = time.time()
|
| 156 |
+
statusBox.info("π Reading PDF files...")
|
| 157 |
+
pdfBytesList = [file.getvalue() for file in uploadedFiles]
|
| 158 |
readDuration = time.time() - startTime
|
| 159 |
+
statusBox.success(f"✅ {len(uploadedFiles)} PDF files read successfully ({readDuration:.2f}s)")
|
| 160 |
|
| 161 |
+
statusBox.info("🧠 Generating combined summary...")
|
| 162 |
+
summary = generateSummary(pipeline, pdfBytesList)
|
| 163 |
totalTime = time.time() - startTime
|
| 164 |
|
| 165 |
if summary:
|
| 166 |
+
statusBox.success(f"✅ Combined summary generated successfully (Total time: {totalTime:.2f}s)")
|
| 167 |
|
| 168 |
with summaryContainer.container():
|
| 169 |
st.markdown("<h2 class='summary-header'>π Generated Summary</h2>", unsafe_allow_html=True)
|
|
|
|
| 177 |
st.download_button(
|
| 178 |
label="⬇️ Download Summary as PDF",
|
| 179 |
data=pdfBytesOut,
|
| 180 |
+
file_name=f"combined_summary_{timestamp}.pdf",
|
| 181 |
mime="application/pdf"
|
| 182 |
)
|
| 183 |
except Exception as e:
|
|
|
|
| 185 |
else:
|
| 186 |
statusBox.error("❌ Failed to generate summary. Please try again.")
|
| 187 |
except Exception as e:
|
| 188 |
+
statusBox.error(f"❌ Error processing PDFs: {str(e)}")
|
| 189 |
else:
|
| 190 |
+
st.info("π Please upload your PDF files using the sidebar to get started.")
|
src/components/extractPdfDetails.py
CHANGED
|
@@ -15,7 +15,7 @@ class ExtractPdfDetails:
|
|
| 15 |
self.config = getConfig(os.path.join(os.getcwd(), "config.ini"))
|
| 16 |
self.prompts = getYaml(os.path.join(os.getcwd(), "prompts.yaml"))
|
| 17 |
self.llmClient = OpenAI(
|
| 18 |
-
base_url = self.config
|
| 19 |
api_key = os.environ["GROQ_API_KEY"]
|
| 20 |
)
|
| 21 |
|
|
|
|
| 15 |
self.config = getConfig(os.path.join(os.getcwd(), "config.ini"))
|
| 16 |
self.prompts = getYaml(os.path.join(os.getcwd(), "prompts.yaml"))
|
| 17 |
self.llmClient = OpenAI(
|
| 18 |
+
base_url = self.config.get("GROQ CONFIG", "BASEURL"),
|
| 19 |
api_key = os.environ["GROQ_API_KEY"]
|
| 20 |
)
|
| 21 |
|
src/pipelines/pipeline.py
CHANGED
|
@@ -9,22 +9,25 @@ class Pipeline:
|
|
| 9 |
self.extractPdfDetails = ExtractPdfDetails()
|
| 10 |
self.summaryEngine = SummaryEngine()
|
| 11 |
|
| 12 |
-
def run(self,
|
| 13 |
"""
|
| 14 |
-
Run the pipeline
|
| 15 |
Args:
|
| 16 |
-
|
| 17 |
Returns:
|
| 18 |
-
summary: summary of
|
| 19 |
"""
|
| 20 |
try:
|
| 21 |
-
logger.info("Running the pipeline")
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
| 25 |
futures = [executor.submit(self.extractPdfDetails.extractDetailsFromChunk, chunk) for chunk in chunks]
|
| 26 |
summaries = [future.result() for future in futures]
|
| 27 |
-
summary = self.summaryEngine.summarize(texts
|
| 28 |
return summary
|
| 29 |
except Exception as e:
|
| 30 |
logger.exception(f"Error running the pipeline: {e}")
|
|
|
|
| 9 |
self.extractPdfDetails = ExtractPdfDetails()
|
| 10 |
self.summaryEngine = SummaryEngine()
|
| 11 |
|
| 12 |
+
def run(self, pdfBytesList: list[bytes]) -> str:
|
| 13 |
"""
|
| 14 |
+
Run the pipeline on multiple PDF files
|
| 15 |
Args:
|
| 16 |
+
pdfBytesList: list of bytes of multiple pdf files
|
| 17 |
Returns:
|
| 18 |
+
summary: combined summary of all pdf files
|
| 19 |
"""
|
| 20 |
try:
|
| 21 |
+
logger.info("Running the pipeline for multiple PDFs")
|
| 22 |
+
allImages = []
|
| 23 |
+
for pdfBytes in pdfBytesList:
|
| 24 |
+
images = self.extractPdfDetails.convertToImages(pdfBytes=pdfBytes)
|
| 25 |
+
allImages.extend(images)
|
| 26 |
+
chunks = self.extractPdfDetails.chunkImages(images=allImages)
|
| 27 |
+
with ThreadPoolExecutor(max_workers=30) as executor:
|
| 28 |
futures = [executor.submit(self.extractPdfDetails.extractDetailsFromChunk, chunk) for chunk in chunks]
|
| 29 |
summaries = [future.result() for future in futures]
|
| 30 |
+
summary = self.summaryEngine.summarize(texts=summaries)
|
| 31 |
return summary
|
| 32 |
except Exception as e:
|
| 33 |
logger.exception(f"Error running the pipeline: {e}")
|