Update app.py
Browse files
app.py
CHANGED
|
@@ -17,6 +17,13 @@ import torch
|
|
| 17 |
# import logging
|
| 18 |
# logging.basicConfig(level=logging.INFO)
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
# ---------------------------------------------------------------------------------------
|
| 21 |
# API Configuration
|
| 22 |
# ---------------------------------------------------------------------------------------
|
|
@@ -276,19 +283,29 @@ st.title("Choose Your Own Adventure (Topic Extraction) PDF Analysis App")
|
|
| 276 |
uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
|
| 277 |
|
| 278 |
if uploaded_file:
|
| 279 |
-
|
| 280 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
-
|
| 283 |
-
for idx, image in enumerate(images):
|
| 284 |
-
markdown_text = extract_markdown_from_image(image)
|
| 285 |
-
markdown_texts.append(markdown_text)
|
| 286 |
|
| 287 |
-
|
|
|
|
|
|
|
|
|
|
| 288 |
|
| 289 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
|
| 291 |
-
# Check if extraction was successful
|
| 292 |
if df.empty or df['Document_Text'].isnull().all():
|
| 293 |
st.error("No meaningful text extracted from the PDF.")
|
| 294 |
st.stop()
|
|
@@ -296,6 +313,12 @@ if uploaded_file:
|
|
| 296 |
st.markdown("### Extracted Markdown Preview")
|
| 297 |
st.write(df.head())
|
| 298 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
# ---------------------------------------------------------------------------------------
|
| 300 |
# User Input for Topics
|
| 301 |
# ---------------------------------------------------------------------------------------
|
|
|
|
| 17 |
# import logging
|
| 18 |
# logging.basicConfig(level=logging.INFO)
|
| 19 |
|
| 20 |
+
if 'pdf_processed' not in st.session_state:
|
| 21 |
+
st.session_state['pdf_processed'] = False
|
| 22 |
+
if 'markdown_texts' not in st.session_state:
|
| 23 |
+
st.session_state['markdown_texts'] = []
|
| 24 |
+
if 'df' not in st.session_state:
|
| 25 |
+
st.session_state['df'] = pd.DataFrame()
|
| 26 |
+
|
| 27 |
# ---------------------------------------------------------------------------------------
|
| 28 |
# API Configuration
|
| 29 |
# ---------------------------------------------------------------------------------------
|
|
|
|
| 283 |
uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
|
| 284 |
|
| 285 |
if uploaded_file:
|
| 286 |
+
if not st.session_state['pdf_processed']:
|
| 287 |
+
with st.spinner("Processing PDF..."):
|
| 288 |
+
images = convert_pdf_to_images(uploaded_file)
|
| 289 |
+
|
| 290 |
+
markdown_texts = []
|
| 291 |
+
for idx, image in enumerate(images):
|
| 292 |
+
markdown_text = extract_markdown_from_image(image)
|
| 293 |
+
markdown_texts.append(markdown_text)
|
| 294 |
|
| 295 |
+
df = pd.DataFrame({'Document_Text': markdown_texts})
|
|
|
|
|
|
|
|
|
|
| 296 |
|
| 297 |
+
# Save results into session state
|
| 298 |
+
st.session_state['markdown_texts'] = markdown_texts
|
| 299 |
+
st.session_state['df'] = df
|
| 300 |
+
st.session_state['pdf_processed'] = True
|
| 301 |
|
| 302 |
+
st.success("PDF processed successfully!")
|
| 303 |
+
else:
|
| 304 |
+
st.success("PDF already processed. Using cached results.")
|
| 305 |
+
|
| 306 |
+
# Use cached dataframe for further processing
|
| 307 |
+
df = st.session_state['df']
|
| 308 |
|
|
|
|
| 309 |
if df.empty or df['Document_Text'].isnull().all():
|
| 310 |
st.error("No meaningful text extracted from the PDF.")
|
| 311 |
st.stop()
|
|
|
|
| 313 |
st.markdown("### Extracted Markdown Preview")
|
| 314 |
st.write(df.head())
|
| 315 |
|
| 316 |
+
if st.button("Reset / Upload New PDF"):
|
| 317 |
+
st.session_state['pdf_processed'] = False
|
| 318 |
+
st.session_state['markdown_texts'] = []
|
| 319 |
+
st.session_state['df'] = pd.DataFrame()
|
| 320 |
+
# st.experimental_rerun is deprecated in favour of st.rerun and absent from newer Streamlit releases; prefer st.rerun and fall back only on older installs.
(getattr(st, "rerun", None) or st.experimental_rerun)()
|
| 321 |
+
|
| 322 |
# ---------------------------------------------------------------------------------------
|
| 323 |
# User Input for Topics
|
| 324 |
# ---------------------------------------------------------------------------------------
|