Spaces:
Sleeping
Sleeping
import hashlib
import io
import logging
import os
import re
import shutil
import tempfile
import traceback
import urllib.parse

import fitz  # PyMuPDF
import streamlit as st
from docx import Document

from PDF_highlight_extractor import extract_highlights, clean_chinese_text
# Module-scoped logger used by everything below; the root logger is
# configured at INFO so the logger.info() progress messages are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
def sanitize_filename(filename):
    """Return a web-safe version of an uploaded file name, keeping Chinese characters.

    The name is percent-decoded, reduced to its last path component, and every
    character that is not a word character, ``-`` or ``.`` is replaced with
    ``_``. The stem is capped at 100 characters; the extension is kept as is.

    Args:
        filename: File name as reported by the client. May be percent-encoded
            and may contain directory components.

    Returns:
        The sanitized name. If nothing usable is left of the stem (empty or
        dots only), the stem becomes ``"upload"``. If sanitizing fails
        altogether, ``"upload_<digest>.pdf"`` is returned, where the digest is
        derived from the input so the same input always yields the same name.
    """
    try:
        filename = urllib.parse.unquote(filename)
        # Treat backslashes as separators too, so Windows-style client paths
        # are reduced to their last component regardless of the host OS.
        filename = os.path.basename(filename.replace("\\", "/"))
        # Use a safer regex for filename sanitization
        filename = re.sub(r'[^\w\u4e00-\u9fff\-\.]', '_', filename)
        name, ext = os.path.splitext(filename)
        name = name[:100]
        # "", "." and ".." are not usable file names; substitute a fixed stem.
        if not name.strip("."):
            name = "upload"
        return name + ext
    except Exception as e:
        logger.warning(f"Error sanitizing filename: {str(e)}")
        # Fallback to a simple safe name. A content digest is used instead of
        # hash(): str hashes are salted per process and can be negative.
        digest = hashlib.sha256(
            str(filename).encode("utf-8", "replace")
        ).hexdigest()[:16]
        return f"upload_{digest}.pdf"
# Page chrome: browser-tab metadata first, then the visible heading and blurb.
# NOTE(review): the "π" characters look like mis-decoded emoji - confirm
# against the deployed app before changing them.
st.set_page_config(
    layout="wide",
    page_icon="π",
    page_title="PDF Highlight Extractor",
)
st.title("π PDF Highlight Extractor")
st.markdown("Upload a PDF file to extract and categorize highlights.")

# Show the installed PyMuPDF version in the sidebar as a debugging aid.
st.sidebar.text("PyMuPDF version: {}".format(fitz.__version__))

# PDF-only upload widget; the processing code below runs only when this is
# not None.
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
def _group_by_category(highlights):
    """Bucket highlight dicts by their 'category' key, keeping first-seen order."""
    grouped = {}
    for highlight in highlights:
        grouped.setdefault(highlight['category'], []).append(highlight)
    return grouped


def _pdf_opens(pdf_path):
    """Return True if PyMuPDF can open pdf_path; otherwise report it and return False.

    The failure is logged and shown to the user exactly once here, so the
    caller must not report it again.
    """
    try:
        test_pdf = fitz.open(pdf_path)
    except Exception as pdf_error:
        logger.error(f"Failed to open PDF: {str(pdf_error)}")
        logger.error(traceback.format_exc())
        st.error(f"Failed to open PDF: {str(pdf_error)}")
        return False
    try:
        logger.info(f"Successfully opened PDF with {test_pdf.page_count} pages")
    finally:
        # Always release the probe handle, even if reading page_count fails.
        test_pdf.close()
    return True


def _render_highlights(highlights_by_category):
    """Show one expander per category with each highlight's page, text and comment."""
    for category, category_highlights in highlights_by_category.items():
        with st.expander(f"π {category} ({len(category_highlights)} highlights)"):
            for highlight in category_highlights:
                st.markdown(f"**Page {highlight['page']}**")
                try:
                    st.markdown(highlight['text'])
                except Exception:
                    # Fallback to plain text with un-encodable characters dropped
                    clean_text = highlight['text'].encode('utf-8', 'ignore').decode('utf-8')
                    st.text(clean_text)
                comment = highlight.get('comment')
                if comment:
                    try:
                        st.markdown(f"> {comment}")
                    except Exception:
                        st.text(f"Comment: {comment}")
                st.markdown("---")


def _build_markdown(highlights_by_category, total):
    """Return the Markdown summary as a single string."""
    parts = [
        "# PDF Highlights Summary\n\n",
        f"Total highlights found: {total}\n\n",
    ]
    for category, category_highlights in highlights_by_category.items():
        parts.append(f"## {category}\n\n")
        for highlight in category_highlights:
            parts.append(f"### Page {highlight['page']}\n\n")
            parts.append(f"{highlight['text']}\n\n")
            if highlight.get('comment'):
                parts.append(f"> {highlight['comment']}\n\n")
            parts.append("---\n\n")
    return "".join(parts)


def _build_text(highlights_by_category, total):
    """Return the plain-text summary as a single string."""
    parts = [f"PDF Highlights Summary\n\nTotal highlights found: {total}\n\n"]
    for category, category_highlights in highlights_by_category.items():
        parts.append(f"{category}\n\n")
        for highlight in category_highlights:
            parts.append(f"Page {highlight['page']}: {highlight['text']}\n")
            if highlight.get('comment'):
                parts.append(f"Comment: {highlight['comment']}\n")
            parts.append("---\n\n")
    return "".join(parts)


def _build_docx_bytes(highlights_by_category, total):
    """Return the Word summary as .docx bytes, built entirely in memory."""
    doc = Document()
    doc.add_heading('PDF Highlights Summary', level=1)
    doc.add_paragraph(f'Total highlights found: {total}')
    for category, category_highlights in highlights_by_category.items():
        doc.add_heading(category, level=2)
        for highlight in category_highlights:
            doc.add_heading(f'Page {highlight["page"]}', level=3)
            doc.add_paragraph(highlight['text'])
            if highlight.get('comment'):
                doc.add_paragraph(f'Comment: {highlight["comment"]}')
            doc.add_paragraph('---')
    # Save to a stream rather than a temp file that is read straight back.
    buffer = io.BytesIO()
    doc.save(buffer)
    return buffer.getvalue()


def _render_download(highlights_by_category, total, base_name):
    """Offer the summary for download in the format picked by the user."""
    file_format = st.selectbox("Select file format for download:", ["Markdown", "Txt", "Word"])
    if file_format == "Markdown":
        st.download_button(
            label="Download Highlights Summary (Markdown)",
            data=_build_markdown(highlights_by_category, total).encode('utf-8'),
            file_name=f"highlights_{base_name}.md",
            mime="text/markdown"
        )
    elif file_format == "Txt":
        st.download_button(
            label="Download Highlights Summary (Text)",
            data=_build_text(highlights_by_category, total).encode('utf-8'),
            file_name=f"highlights_{base_name}.txt",
            mime="text/plain"
        )
    elif file_format == "Word":
        st.download_button(
            label="Download Highlights Summary (Word)",
            data=_build_docx_bytes(highlights_by_category, total),
            file_name=f"highlights_{base_name}.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        )


# Main flow: copy the upload to a private temp directory, extract highlights
# from it, render them grouped by category, and offer a downloadable summary.
if uploaded_file is not None:
    try:
        temp_dir = tempfile.mkdtemp()
        try:
            # Read the upload once and reuse the bytes for both the size check
            # and the temp-file write.
            pdf_bytes = uploaded_file.getvalue()
            file_size = len(pdf_bytes) / (1024 * 1024)  # Size in MB
            if file_size > 200:  # Streamlit Cloud's default limit
                st.error(f"File size ({file_size:.1f}MB) exceeds the 200MB limit. Please upload a smaller file.")
                # However control leaves this block, the finally clause below
                # still removes temp_dir.
                st.stop()

            original_filename = uploaded_file.name
            logger.info(f"Processing file: {original_filename}")
            safe_filename = sanitize_filename(original_filename)
            logger.info(f"Sanitized filename: {safe_filename}")

            # Only the extension of the client-supplied name reaches the disk
            # path; the stem is used solely for the download file names.
            base_name, file_extension = os.path.splitext(safe_filename)
            temp_file_path = os.path.join(temp_dir, f"temp_upload{file_extension}")
            with open(temp_file_path, "wb") as f:
                f.write(pdf_bytes)
            logger.info(f"Saved to temp file: {temp_file_path}")

            try:
                with st.spinner("Extracting highlights..."):
                    # Test if we can open the PDF first; on failure the helper
                    # has already reported it, so extraction is skipped.
                    pdf_ok = _pdf_opens(temp_file_path)
                    highlights = extract_highlights(temp_file_path) if pdf_ok else []

                if pdf_ok:
                    logger.info(f"Extracted {len(highlights)} highlights")
                    if highlights:
                        st.success(f"Found {len(highlights)} highlights!")
                        highlights_by_category = _group_by_category(highlights)
                        _render_highlights(highlights_by_category)
                        _render_download(highlights_by_category, len(highlights), base_name)
                    else:
                        st.warning("No highlights found in the PDF.")
            except Exception as e:
                logger.error(f"Error processing PDF: {str(e)}")
                logger.error(traceback.format_exc())
                st.error(f"Error processing PDF: {str(e)}")
        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)
    except Exception as e:
        logger.error(f"Error processing file: {str(e)}")
        logger.error(traceback.format_exc())
        st.error(f"Error processing file: {str(e)}")
# Static "About" panel for the sidebar: the colour-to-category legend.
# NOTE(review): the leading symbols on each bullet look like mis-decoded
# emoji - confirm against the deployed app before changing them.
_ABOUT_MARKDOWN = """
### About
This app extracts and categorizes highlights from PDF files based on their colors:
- π‘ Light Blue: Ideas & Insights
- π Yellow: General Notes
- β Green: Action Items / To-Do
- π Pink: Quotes & References
- β οΈ Red: Critical Issues / Warnings
"""
st.sidebar.markdown(_ABOUT_MARKDOWN)