Spaces:

gnlui
/

fincatch-ocr

Sleeping

App Files Files Community

fincatch-ocr / src /streamlit_app.py

gnlui

initial

0bad002 7 months ago

raw

history blame contribute delete

9.8 kB

	import streamlit as st
	import os
	import logging
	from PDF_highlight_extractor import extract_highlights, clean_chinese_text
	from docx import Document
	import urllib.parse
	import tempfile
	import shutil
	import re
	import traceback
	import fitz # PyMuPDF

	# Set up logging: configure the root logger at INFO level and create the
	# module-level logger that the rest of this script writes to.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	def sanitize_filename(filename):
	    """Sanitize filename for web use while preserving Chinese characters.

	    The value is URL-decoded, reduced to its final path component, and every
	    character other than word characters, CJK ideographs (U+4E00-U+9FFF),
	    ``-`` and ``.`` is replaced with ``_``. The stem is capped at 100
	    characters; the extension is kept unchanged.

	    Args:
	        filename: Client-supplied file name, possibly URL-encoded and/or
	            carrying directory components.

	    Returns:
	        A non-empty file name. A stem that is empty or consists only of
	        dots is replaced with ``upload``. If sanitizing fails altogether,
	        ``upload_<digest>.pdf`` is returned, where the digest is derived
	        from ``str(filename)`` and is stable across runs.
	    """
	    import hashlib  # local import: only the fallback path needs it

	    try:
	        decoded = urllib.parse.unquote(filename)
	        # os.path.basename only splits on the host separator, so normalise
	        # backslashes first; otherwise a Windows-style path survives whole
	        # on POSIX hosts and is merely underscored.
	        leaf = os.path.basename(decoded.replace('\\', '/'))
	        # Use a safer regex for filename sanitization
	        cleaned = re.sub(r'[^\w\u4e00-\u9fff\-\.]', '_', leaf)
	        name, ext = os.path.splitext(cleaned)
	        name = name[:100]
	        # "", "." and ".." pass the character filter but are not usable
	        # file names, so substitute a neutral stem.
	        if not name.strip('.'):
	            name = "upload"
	        return name + ext
	    except Exception as e:
	        # Deliberately broad: whatever goes wrong, the caller still gets a
	        # usable name instead of an exception.
	        logger.warning(f"Error sanitizing filename: {str(e)}")
	        # The builtin hash() is salted per process and may be negative;
	        # a truncated SHA-256 gives a stable, filename-safe token.
	        digest = hashlib.sha256(
	            str(filename).encode('utf-8', 'replace')
	        ).hexdigest()[:16]
	        return f"upload_{digest}.pdf"

	# Page-level settings; set_page_config stays the first Streamlit call
	# made by this script.
	page_settings = {
	    "page_title": "PDF Highlight Extractor",
	    "page_icon": "📄",
	    "layout": "wide",
	}
	st.set_page_config(**page_settings)

	# Main-area header and one-line usage hint.
	st.title("📄 PDF Highlight Extractor")
	st.markdown("Upload a PDF file to extract and categorize highlights.")

	# Show the installed PyMuPDF version in the sidebar as a debugging aid.
	pymupdf_version = fitz.__version__
	st.sidebar.text(f"PyMuPDF version: {pymupdf_version}")

	def _pdf_can_be_opened(pdf_path):
	    """Return True if PyMuPDF can open ``pdf_path``.

	    On failure the error is logged and shown in the UI exactly once, and
	    False is returned so the caller can stop without reporting it again.
	    """
	    try:
	        # The context manager closes the document even if logging raises.
	        with fitz.open(pdf_path) as test_pdf:
	            logger.info(f"Successfully opened PDF with {test_pdf.page_count} pages")
	    except Exception as pdf_error:
	        logger.error(f"Failed to open PDF: {str(pdf_error)}")
	        logger.error(traceback.format_exc())
	        st.error(f"Failed to open PDF: {str(pdf_error)}")
	        return False
	    return True


	def _group_by_category(highlights):
	    """Group highlight dicts by their 'category' key, keeping first-seen order."""
	    grouped = {}
	    for highlight in highlights:
	        grouped.setdefault(highlight['category'], []).append(highlight)
	    return grouped


	def _show_highlights(highlights_by_category):
	    """Render one expander per category listing page, text and comment."""
	    for category, category_highlights in highlights_by_category.items():
	        with st.expander(f"📌 {category} ({len(category_highlights)} highlights)"):
	            for highlight in category_highlights:
	                st.markdown(f"Page {highlight['page']}")
	                try:
	                    st.markdown(highlight['text'])
	                except Exception:
	                    # Best-effort fallback: drop unencodable characters
	                    # and show the text without Markdown rendering.
	                    clean_text = highlight['text'].encode('utf-8', 'ignore').decode('utf-8')
	                    st.text(clean_text)  # Fallback to plain text

	                if highlight['comment']:
	                    try:
	                        st.markdown(f"> {highlight['comment']}")
	                    except Exception:
	                        st.text(f"Comment: {highlight['comment']}")
	                st.markdown("---")


	def _markdown_summary(highlights_by_category, total):
	    """Return the Markdown summary as a single string."""
	    parts = [
	        "# PDF Highlights Summary\n\n",
	        f"Total highlights found: {total}\n\n",
	    ]
	    for category, category_highlights in highlights_by_category.items():
	        parts.append(f"## {category}\n\n")
	        for highlight in category_highlights:
	            parts.append(f"### Page {highlight['page']}\n\n")
	            parts.append(f"{highlight['text']}\n\n")
	            if highlight['comment']:
	                parts.append(f"> {highlight['comment']}\n\n")
	            parts.append("---\n\n")
	    # Joining once avoids repeated string reallocation on large documents.
	    return "".join(parts)


	def _text_summary(highlights_by_category, total):
	    """Return the plain-text summary as a single string."""
	    parts = [f"PDF Highlights Summary\n\nTotal highlights found: {total}\n\n"]
	    for category, category_highlights in highlights_by_category.items():
	        parts.append(f"{category}\n\n")
	        for highlight in category_highlights:
	            parts.append(f"Page {highlight['page']}: {highlight['text']}\n")
	            if highlight['comment']:
	                parts.append(f"Comment: {highlight['comment']}\n")
	            parts.append("---\n\n")
	    return "".join(parts)


	def _docx_summary(highlights_by_category, total):
	    """Build the Word summary in memory and return it as bytes."""
	    import io  # local import: only this helper needs it

	    doc = Document()
	    doc.add_heading('PDF Highlights Summary', level=1)
	    doc.add_paragraph(f'Total highlights found: {total}')

	    for category, category_highlights in highlights_by_category.items():
	        doc.add_heading(category, level=2)
	        for highlight in category_highlights:
	            doc.add_heading(f'Page {highlight["page"]}', level=3)
	            doc.add_paragraph(highlight['text'])
	            if highlight['comment']:
	                doc.add_paragraph(f'Comment: {highlight["comment"]}')
	            doc.add_paragraph('---')

	    # Saving to a stream avoids writing the file to disk just to read it back.
	    buffer = io.BytesIO()
	    doc.save(buffer)
	    return buffer.getvalue()


	def _offer_download(highlights_by_category, total, base_name):
	    """Show the format selector and the matching download button.

	    ``base_name`` is the sanitized upload name without its extension; the
	    download is named ``highlights_<base_name>`` plus the format suffix.
	    """
	    file_format = st.selectbox("Select file format for download:", ["Markdown", "Txt", "Word"])

	    if file_format == "Markdown":
	        st.download_button(
	            label="Download Highlights Summary (Markdown)",
	            data=_markdown_summary(highlights_by_category, total).encode('utf-8'),
	            file_name=f"highlights_{base_name}.md",
	            mime="text/markdown",
	        )
	    elif file_format == "Txt":
	        st.download_button(
	            label="Download Highlights Summary (Text)",
	            data=_text_summary(highlights_by_category, total).encode('utf-8'),
	            file_name=f"highlights_{base_name}.txt",
	            mime="text/plain",
	        )
	    elif file_format == "Word":
	        st.download_button(
	            label="Download Highlights Summary (Word)",
	            data=_docx_summary(highlights_by_category, total),
	            file_name=f"highlights_{base_name}.docx",
	            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	        )


	uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

	if uploaded_file is not None:
	    try:
	        temp_dir = tempfile.mkdtemp()
	        try:
	            # Read the upload once and reuse the bytes for both the size
	            # check and the temp-file write.
	            pdf_bytes = uploaded_file.getvalue()

	            # Add file size check
	            file_size = len(pdf_bytes) / (1024 * 1024)  # Size in MB
	            if file_size > 200:  # Streamlit Cloud's default limit
	                st.error(f"File size ({file_size:.1f}MB) exceeds the 200MB limit. Please upload a smaller file.")
	                st.stop()

	            original_filename = uploaded_file.name
	            logger.info(f"Processing file: {original_filename}")

	            safe_filename = sanitize_filename(original_filename)
	            logger.info(f"Sanitized filename: {safe_filename}")

	            base_name, file_extension = os.path.splitext(safe_filename)
	            # The on-disk name is fixed; only the extension comes from the upload.
	            temp_file_path = os.path.join(temp_dir, f"temp_upload{file_extension}")

	            with open(temp_file_path, "wb") as f:
	                f.write(pdf_bytes)

	            logger.info(f"Saved to temp file: {temp_file_path}")

	            try:
	                # None means "could not open"; that case has already been
	                # reported to the user by _pdf_can_be_opened.
	                highlights = None
	                with st.spinner("Extracting highlights..."):
	                    # Test if we can open the PDF first
	                    if _pdf_can_be_opened(temp_file_path):
	                        # Extract highlights
	                        highlights = extract_highlights(temp_file_path)
	                        logger.info(f"Extracted {len(highlights)} highlights")

	                if highlights is None:
	                    pass
	                elif highlights:
	                    st.success(f"Found {len(highlights)} highlights!")
	                    highlights_by_category = _group_by_category(highlights)
	                    _show_highlights(highlights_by_category)
	                    _offer_download(highlights_by_category, len(highlights), base_name)
	                else:
	                    st.warning("No highlights found in the PDF.")

	            except Exception as e:
	                logger.error(f"Error processing PDF: {str(e)}")
	                logger.error(traceback.format_exc())
	                st.error(f"Error processing PDF: {str(e)}")

	        finally:
	            # Best-effort cleanup; also runs when st.stop() ends the script.
	            shutil.rmtree(temp_dir, ignore_errors=True)

	    except Exception as e:
	        logger.error(f"Error processing file: {str(e)}")
	        logger.error(traceback.format_exc())
	        st.error(f"Error processing file: {str(e)}")

	# Sidebar legend listing the category shown for each highlight colour.
	about_text = """
	### About
	This app extracts and categorizes highlights from PDF files based on their colors:

	- 💡 Light Blue: Ideas & Insights
	- 📝 Yellow: General Notes
	- ✅ Green: Action Items / To-Do
	- 📖 Pink: Quotes & References
	- ⚠️ Red: Critical Issues / Warnings
	"""
	st.sidebar.markdown(about_text)