"""Streamlit app: upload a PDF, list its highlights by category, and export them.

The extraction itself is delegated to ``PDF_highlight_extractor.extract_highlights``;
this script only handles the upload, the on-screen rendering, and the
Markdown / plain-text / Word downloads.
"""
import io
import logging
import os
import re
import shutil
import tempfile
import traceback
import urllib.parse

import fitz  # PyMuPDF
import streamlit as st
from docx import Document

from PDF_highlight_extractor import extract_highlights, clean_chinese_text

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MAX_UPLOAD_MB = 200  # Streamlit Cloud's default limit
MAX_NAME_CHARS = 100
DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"


def sanitize_filename(filename):
    """Sanitize filename for web use while preserving Chinese characters.

    The name is URL-unquoted, reduced to its basename, every character that is
    not a word character, CJK ideograph, ``-`` or ``.`` is replaced by ``_``,
    and the stem (not the extension) is truncated to 100 characters.

    Args:
        filename: The client-supplied file name.

    Returns:
        The sanitized name. If anything goes wrong while sanitizing, a
        generic ``upload_<hash>.pdf`` name is returned instead of raising.
    """
    try:
        filename = urllib.parse.unquote(filename)
        filename = os.path.basename(filename)
        # Use a safer regex for filename sanitization
        filename = re.sub(r'[^\w\u4e00-\u9fff\-\.]', '_', filename)
        name, ext = os.path.splitext(filename)
        return name[:MAX_NAME_CHARS] + ext
    except Exception as e:
        logger.warning("Error sanitizing filename: %s", e)
        # Fallback to a simple safe name
        return f"upload_{hash(str(filename))}.pdf"


def _group_by_category(highlights):
    """Return a dict mapping each category to its highlights, in first-seen order."""
    grouped = {}
    for highlight in highlights:
        grouped.setdefault(highlight['category'], []).append(highlight)
    return grouped


def _render_highlights(highlights_by_category):
    """Show one expander per category listing page, text and comment of each highlight."""
    for category, category_highlights in highlights_by_category.items():
        with st.expander(f"📌 {category} ({len(category_highlights)} highlights)"):
            for highlight in category_highlights:
                st.markdown(f"**Page {highlight['page']}**")
                try:
                    st.markdown(highlight['text'])
                except Exception:
                    # Fallback to plain text with un-encodable characters dropped
                    clean_text = highlight['text'].encode('utf-8', 'ignore').decode('utf-8')
                    st.text(clean_text)
                if highlight['comment']:
                    try:
                        st.markdown(f"> {highlight['comment']}")
                    except Exception:
                        st.text(f"Comment: {highlight['comment']}")
                st.markdown("---")


def _build_markdown(highlights_by_category, total):
    """Return the Markdown summary as a string."""
    parts = [
        "# PDF Highlights Summary\n\n",
        f"Total highlights found: {total}\n\n",
    ]
    for category, category_highlights in highlights_by_category.items():
        parts.append(f"## {category}\n\n")
        for highlight in category_highlights:
            parts.append(f"### Page {highlight['page']}\n\n")
            parts.append(f"{highlight['text']}\n\n")
            if highlight['comment']:
                parts.append(f"> {highlight['comment']}\n\n")
            parts.append("---\n\n")
    return "".join(parts)


def _build_text(highlights_by_category, total):
    """Return the plain-text summary as a string."""
    parts = [f"PDF Highlights Summary\n\nTotal highlights found: {total}\n\n"]
    for category, category_highlights in highlights_by_category.items():
        parts.append(f"{category}\n\n")
        for highlight in category_highlights:
            parts.append(f"Page {highlight['page']}: {highlight['text']}\n")
            if highlight['comment']:
                parts.append(f"Comment: {highlight['comment']}\n")
            parts.append("---\n\n")
    return "".join(parts)


def _build_docx(highlights_by_category, total):
    """Return the Word summary as the bytes of a .docx file.

    The document is written to an in-memory buffer rather than to a file
    named after the upload: long non-ASCII names can exceed filesystem
    name-length limits, and no disk round-trip is needed for a download.
    """
    doc = Document()
    doc.add_heading('PDF Highlights Summary', level=1)
    doc.add_paragraph(f'Total highlights found: {total}')
    for category, category_highlights in highlights_by_category.items():
        doc.add_heading(category, level=2)
        for highlight in category_highlights:
            doc.add_heading(f'Page {highlight["page"]}', level=3)
            doc.add_paragraph(highlight['text'])
            if highlight['comment']:
                doc.add_paragraph(f'Comment: {highlight["comment"]}')
            doc.add_paragraph('---')
    buffer = io.BytesIO()
    doc.save(buffer)
    return buffer.getvalue()


def _render_download(highlights_by_category, total, safe_filename):
    """Let the user pick an export format and offer the matching download button."""
    stem = os.path.splitext(safe_filename)[0]
    file_format = st.selectbox(
        "Select file format for download:", ["Markdown", "Txt", "Word"]
    )
    if file_format == "Markdown":
        st.download_button(
            label="Download Highlights Summary (Markdown)",
            # Ensure proper encoding
            data=_build_markdown(highlights_by_category, total).encode('utf-8'),
            file_name=f"highlights_{stem}.md",
            mime="text/markdown",
        )
    elif file_format == "Txt":
        st.download_button(
            label="Download Highlights Summary (Text)",
            # Ensure proper encoding
            data=_build_text(highlights_by_category, total).encode('utf-8'),
            file_name=f"highlights_{stem}.txt",
            mime="text/plain",
        )
    elif file_format == "Word":
        st.download_button(
            label="Download Highlights Summary (Word)",
            data=_build_docx(highlights_by_category, total),
            file_name=f"highlights_{stem}.docx",
            mime=DOCX_MIME,
        )


def _pdf_opens(path):
    """Return True if PyMuPDF can open *path*; otherwise report the error and return False.

    The failure is shown and logged here exactly once, so the caller must not
    report it again.
    """
    try:
        with fitz.open(path) as test_pdf:
            logger.info("Successfully opened PDF with %s pages", test_pdf.page_count)
    except Exception as pdf_error:
        logger.exception("Failed to open PDF: %s", pdf_error)
        st.error(f"Failed to open PDF: {str(pdf_error)}")
        return False
    return True


def _handle_upload(uploaded_file):
    """Save the upload to a temp file, extract its highlights and render the results.

    Errors raised while extracting or rendering are reported as
    "Error processing PDF"; anything raised outside that step propagates to
    the caller. The temp directory is always removed.
    """
    # Read the payload once; it is used for both the size check and the write.
    pdf_bytes = uploaded_file.getvalue()
    file_size = len(pdf_bytes) / (1024 * 1024)  # Size in MB
    if file_size > MAX_UPLOAD_MB:
        st.error(
            f"File size ({file_size:.1f}MB) exceeds the {MAX_UPLOAD_MB}MB limit. "
            "Please upload a smaller file."
        )
        st.stop()

    original_filename = uploaded_file.name
    logger.info("Processing file: %s", original_filename)
    safe_filename = sanitize_filename(original_filename)
    logger.info("Sanitized filename: %s", safe_filename)

    temp_dir = tempfile.mkdtemp()
    try:
        # Only the extension of the client-supplied name reaches the filesystem.
        file_extension = os.path.splitext(safe_filename)[1]
        temp_file_path = os.path.join(temp_dir, f"temp_upload{file_extension}")
        with open(temp_file_path, "wb") as f:
            f.write(pdf_bytes)
        logger.info("Saved to temp file: %s", temp_file_path)

        try:
            with st.spinner("Extracting highlights..."):
                # Test if we can open the PDF first
                if not _pdf_opens(temp_file_path):
                    return
                # Extract highlights
                highlights = extract_highlights(temp_file_path)
            logger.info("Extracted %s highlights", len(highlights))

            if highlights:
                st.success(f"Found {len(highlights)} highlights!")
                highlights_by_category = _group_by_category(highlights)
                _render_highlights(highlights_by_category)
                _render_download(highlights_by_category, len(highlights), safe_filename)
            else:
                st.warning("No highlights found in the PDF.")
        except Exception as e:
            logger.exception("Error processing PDF: %s", e)
            st.error(f"Error processing PDF: {str(e)}")
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)


st.set_page_config(
    page_title="PDF Highlight Extractor",
    page_icon="📄",
    layout="wide"
)

st.title("📄 PDF Highlight Extractor")
st.markdown("Upload a PDF file to extract and categorize highlights.")

# Display PyMuPDF version for debugging
st.sidebar.text(f"PyMuPDF version: {fitz.__version__}")

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    try:
        _handle_upload(uploaded_file)
    except Exception as e:
        logger.exception("Error processing file: %s", e)
        st.error(f"Error processing file: {str(e)}")

st.sidebar.markdown("""
### About
This app extracts and categorizes highlights from PDF files based on their colors:
- 💡 Light Blue: Ideas & Insights
- 📝 Yellow: General Notes
- ✅ Green: Action Items / To-Do
- 📖 Pink: Quotes & References
- ⚠️ Red: Critical Issues / Warnings
""")