# Repository path: fincatch-ocr / src / streamlit_app.py
# Hosting-page header residue, kept verbatim: "gnlui's picture", "initial", "0bad002"
import io
import logging
import os
import re
import shutil
import tempfile
import traceback
import urllib.parse

import fitz  # PyMuPDF
import streamlit as st
from docx import Document

from PDF_highlight_extractor import extract_highlights, clean_chinese_text
# Set up logging: configure the root logger at INFO level and create the
# module-level logger used by the rest of this script for diagnostics.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def sanitize_filename(filename):
    """Sanitize filename for web use while preserving Chinese characters.

    The name is percent-decoded, reduced to its final path component, and
    every character that is not a word character, CJK ideograph, hyphen or
    dot is replaced with an underscore. The stem is capped at 100 characters;
    the extension is kept as-is.

    Args:
        filename: The client-supplied file name (may be percent-encoded and
            may contain directory components).

    Returns:
        A sanitized file name. If nothing usable remains (empty or dot-only
        stem) the stem becomes ``"upload"``. If sanitizing raises, a generic
        ``upload_<hash>.pdf`` name is returned instead.
    """
    try:
        decoded = urllib.parse.unquote(filename)
        # os.path.basename only splits on the host OS separator, so a
        # Windows-style path ("C:\\dir\\file.pdf") would survive intact on
        # POSIX. Normalizing to "/" works for both posixpath and ntpath.
        base = os.path.basename(decoded.replace("\\", "/"))
        # Use a safer regex for filename sanitization
        cleaned = re.sub(r'[^\w\u4e00-\u9fff\-\.]', '_', base)
        stem, ext = os.path.splitext(cleaned)
        stem = stem[:100]
        # "", "." and ".." are not usable file names; substitute a neutral stem.
        if not stem.strip("."):
            stem = "upload"
        return stem + ext
    except Exception as e:
        logger.warning("Error sanitizing filename: %s", e)
        # Fallback to a simple safe name. The builtin str hash is salted per
        # interpreter run, so this name is only stable within one process.
        return f"upload_{hash(str(filename))}.pdf"
# Page setup. set_page_config is issued before any other Streamlit command.
# The icon and title emoji were mis-encoded (UTF-8 bytes shown as single-byte
# characters); they are restored to the intended "page facing up" emoji.
st.set_page_config(
    page_title="PDF Highlight Extractor",
    page_icon="📄",
    layout="wide",
)
st.title("📄 PDF Highlight Extractor")
st.markdown("Upload a PDF file to extract and categorize highlights.")
# Display PyMuPDF version for debugging
st.sidebar.text(f"PyMuPDF version: {fitz.__version__}")
# Returns None until the user has picked a file; the processing section
# below only runs once a file is available.
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
# Upload size ceiling in megabytes (Streamlit Cloud's default limit).
_MAX_UPLOAD_MB = 200


def _group_by_category(highlights):
    """Group highlight dicts by their 'category' key, preserving input order."""
    grouped = {}
    for highlight in highlights:
        grouped.setdefault(highlight['category'], []).append(highlight)
    return grouped


def _build_markdown_summary(highlights_by_category, total):
    """Return the Markdown export: one section per category, one entry per highlight."""
    parts = [
        "# PDF Highlights Summary\n\n",
        f"Total highlights found: {total}\n\n",
    ]
    for category, category_highlights in highlights_by_category.items():
        parts.append(f"## {category}\n\n")
        for highlight in category_highlights:
            parts.append(f"### Page {highlight['page']}\n\n")
            parts.append(f"{highlight['text']}\n\n")
            if highlight['comment']:
                parts.append(f"> {highlight['comment']}\n\n")
            parts.append("---\n\n")
    return "".join(parts)


def _build_text_summary(highlights_by_category, total):
    """Return the plain-text export of all highlights, grouped by category."""
    parts = [f"PDF Highlights Summary\n\nTotal highlights found: {total}\n\n"]
    for category, category_highlights in highlights_by_category.items():
        parts.append(f"{category}\n\n")
        for highlight in category_highlights:
            parts.append(f"Page {highlight['page']}: {highlight['text']}\n")
            if highlight['comment']:
                parts.append(f"Comment: {highlight['comment']}\n")
            parts.append("---\n\n")
    return "".join(parts)


def _build_docx_bytes(highlights_by_category, total):
    """Return the Word export as bytes.

    The document is saved to an in-memory buffer rather than a temp file
    named after the upload: a stem of 100 CJK characters is 300 bytes in
    UTF-8, which exceeds the usual 255-byte file name limit and made the
    on-disk save fail.
    """
    doc = Document()
    doc.add_heading('PDF Highlights Summary', level=1)
    doc.add_paragraph(f'Total highlights found: {total}')
    for category, category_highlights in highlights_by_category.items():
        doc.add_heading(category, level=2)
        for highlight in category_highlights:
            doc.add_heading(f'Page {highlight["page"]}', level=3)
            doc.add_paragraph(highlight['text'])
            if highlight['comment']:
                doc.add_paragraph(f'Comment: {highlight["comment"]}')
            doc.add_paragraph('---')
    buffer = io.BytesIO()
    doc.save(buffer)
    return buffer.getvalue()


def _render_highlights(highlights_by_category):
    """Show every category in its own expander, one entry per highlight."""
    for category, category_highlights in highlights_by_category.items():
        with st.expander(f"📌 {category} ({len(category_highlights)} highlights)"):
            for highlight in category_highlights:
                st.markdown(f"**Page {highlight['page']}**")
                try:
                    st.markdown(highlight['text'])
                except Exception:
                    # Fallback to plain text with un-encodable characters dropped.
                    clean_text = highlight['text'].encode('utf-8', 'ignore').decode('utf-8')
                    st.text(clean_text)
                if highlight['comment']:
                    try:
                        st.markdown(f"> {highlight['comment']}")
                    except Exception:
                        st.text(f"Comment: {highlight['comment']}")
                st.markdown("---")


def _render_download(highlights_by_category, total, stem):
    """Offer the summary as a download in the format chosen by the user.

    Args:
        highlights_by_category: Mapping of category name to highlight dicts.
        total: Total number of highlights (shown in the summary header).
        stem: Sanitized upload name without extension, used in the file name.
    """
    file_format = st.selectbox("Select file format for download:", ["Markdown", "Txt", "Word"])
    if file_format == "Markdown":
        st.download_button(
            label="Download Highlights Summary (Markdown)",
            data=_build_markdown_summary(highlights_by_category, total).encode('utf-8'),
            file_name=f"highlights_{stem}.md",
            mime="text/markdown",
        )
    elif file_format == "Txt":
        st.download_button(
            label="Download Highlights Summary (Text)",
            data=_build_text_summary(highlights_by_category, total).encode('utf-8'),
            file_name=f"highlights_{stem}.txt",
            mime="text/plain",
        )
    elif file_format == "Word":
        st.download_button(
            label="Download Highlights Summary (Word)",
            data=_build_docx_bytes(highlights_by_category, total),
            file_name=f"highlights_{stem}.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )


def _extract_from_pdf(pdf_path):
    """Verify the PDF opens, then return the highlights extracted from it.

    Raises:
        RuntimeError: If PyMuPDF cannot open the file; the original error is
            chained so the caller reports the failure exactly once.
    """
    # Test if we can open the PDF first
    try:
        probe = fitz.open(pdf_path)
        try:
            logger.info(f"Successfully opened PDF with {probe.page_count} pages")
        finally:
            probe.close()
    except Exception as pdf_error:
        logger.error(f"Failed to open PDF: {str(pdf_error)}")
        raise RuntimeError(f"Failed to open PDF: {pdf_error}") from pdf_error
    highlights = extract_highlights(pdf_path)
    logger.info(f"Extracted {len(highlights)} highlights")
    return highlights


if uploaded_file is not None:
    try:
        # Read the upload once; it is needed for the size check and the temp copy.
        pdf_bytes = uploaded_file.getvalue()
        file_size = len(pdf_bytes) / (1024 * 1024)  # Size in MB
        if file_size > _MAX_UPLOAD_MB:
            st.error(
                f"File size ({file_size:.1f}MB) exceeds the {_MAX_UPLOAD_MB}MB limit. "
                "Please upload a smaller file."
            )
            st.stop()

        original_filename = uploaded_file.name
        logger.info(f"Processing file: {original_filename}")
        safe_filename = sanitize_filename(original_filename)
        logger.info(f"Sanitized filename: {safe_filename}")
        safe_stem, file_extension = os.path.splitext(safe_filename)

        # extract_highlights takes a file path, so the upload is copied into a
        # private temp directory that is always removed afterwards.
        temp_dir = tempfile.mkdtemp()
        try:
            temp_file_path = os.path.join(temp_dir, f"temp_upload{file_extension}")
            with open(temp_file_path, "wb") as f:
                f.write(pdf_bytes)
            logger.info(f"Saved to temp file: {temp_file_path}")
            try:
                with st.spinner("Extracting highlights..."):
                    highlights = _extract_from_pdf(temp_file_path)
                if highlights:
                    st.success(f"Found {len(highlights)} highlights!")
                    highlights_by_category = _group_by_category(highlights)
                    _render_highlights(highlights_by_category)
                    _render_download(highlights_by_category, len(highlights), safe_stem)
                else:
                    st.warning("No highlights found in the PDF.")
            except Exception as e:
                # logger.exception records the message together with the traceback.
                logger.exception(f"Error processing PDF: {str(e)}")
                st.error(f"Error processing PDF: {str(e)}")
        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)
    except Exception as e:
        logger.exception(f"Error processing file: {str(e)}")
        st.error(f"Error processing file: {str(e)}")
# Static sidebar legend describing how highlight colors map to categories.
# The emoji were mis-encoded (UTF-8 bytes shown as single-byte characters)
# and are restored here. NOTE(review): the "Yellow" icon had lost a byte and
# is reconstructed as the memo emoji; confirm against the original file.
st.sidebar.markdown("""
### About
This app extracts and categorizes highlights from PDF files based on their colors:
- 💡 Light Blue: Ideas & Insights
- 📝 Yellow: General Notes
- ✅ Green: Action Items / To-Do
- 📖 Pink: Quotes & References
- ⚠️ Red: Critical Issues / Warnings
""")