Spaces:

gnlui
/

fincatch-ocr

Sleeping

File size: 9,802 Bytes

76b7b9f
0bad002
 
 
 
 
 
 
 
 
 
76b7b9f
0bad002

import streamlit as st
import os
import logging
from PDF_highlight_extractor import extract_highlights, clean_chinese_text
from docx import Document
import urllib.parse
import tempfile
import shutil
import re
import traceback
import fitz  # PyMuPDF

# Module-wide logger for this app; the root logger is configured at INFO
# so the progress messages emitted below are not filtered out.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

def sanitize_filename(filename):
    """Sanitize filename for web use while preserving Chinese characters.

    The name is percent-decoded, reduced to its final path component, and
    every character other than word characters, CJK ideographs, ``-`` and
    ``.`` is replaced with ``_``. The stem is capped at 100 characters; the
    extension is kept unchanged.

    Args:
        filename: Name supplied with the upload. It may be percent-encoded
            and may contain directory components.

    Returns:
        A non-empty sanitized file name. When no usable stem remains (empty
        input, a trailing-slash path, or a stem made only of dots such as
        ``".."``), the stem ``"upload"`` is substituted. If sanitizing
        raises, a generic ``upload_<digest>.pdf`` name is returned instead.
    """
    try:
        filename = urllib.parse.unquote(filename)
        filename = os.path.basename(filename)
        # Use a safer regex for filename sanitization
        filename = re.sub(r'[^\w\u4e00-\u9fff\-\.]', '_', filename)
        name, ext = os.path.splitext(filename)
        # An empty or dot-only stem ("", ".", "..") is not a usable file
        # name, so fall back to a neutral stem instead of returning it.
        if not name.strip('.'):
            name = "upload"
        return name[:100] + ext
    except Exception as e:
        logger.warning("Error sanitizing filename: %s", e)
        # Fallback to a simple safe name. The built-in hash() is salted per
        # process for str and can be negative, so a stable hex digest is
        # used to keep the name reproducible and free of '-'.
        import hashlib
        digest = hashlib.sha256(
            str(filename).encode('utf-8', 'replace')
        ).hexdigest()[:16]
        return f"upload_{digest}.pdf"

# Page-level configuration is issued ahead of every other st.* call below.
st.set_page_config(page_title="PDF Highlight Extractor", page_icon="📄", layout="wide")

# Heading and short usage hint for the main column.
st.title("📄 PDF Highlight Extractor")
st.markdown("Upload a PDF file to extract and categorize highlights.")

# Show the installed PyMuPDF version in the sidebar as a debugging aid.
st.sidebar.text("PyMuPDF version: {}".format(fitz.__version__))

# Upload widget, restricted to type "pdf"; the result is None until a file is chosen
# (the processing code below checks for that).
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

def _group_by_category(highlights):
    """Group highlight dicts by their 'category' key, keeping first-seen order."""
    grouped = {}
    for highlight in highlights:
        grouped.setdefault(highlight['category'], []).append(highlight)
    return grouped


def _build_markdown_summary(highlights_by_category, total):
    """Return the Markdown export: one section per category, one subsection per highlight."""
    parts = [
        "# PDF Highlights Summary\n\n",
        f"Total highlights found: {total}\n\n",
    ]
    for category, category_highlights in highlights_by_category.items():
        parts.append(f"## {category}\n\n")
        for highlight in category_highlights:
            parts.append(f"### Page {highlight['page']}\n\n")
            parts.append(f"{highlight['text']}\n\n")
            if highlight['comment']:
                parts.append(f"> {highlight['comment']}\n\n")
        parts.append("---\n\n")
    return "".join(parts)


def _build_text_summary(highlights_by_category, total):
    """Return the plain-text export: one line per highlight, grouped by category."""
    parts = [f"PDF Highlights Summary\n\nTotal highlights found: {total}\n\n"]
    for category, category_highlights in highlights_by_category.items():
        parts.append(f"{category}\n\n")
        for highlight in category_highlights:
            parts.append(f"Page {highlight['page']}: {highlight['text']}\n")
            if highlight['comment']:
                parts.append(f"Comment: {highlight['comment']}\n")
            parts.append("---\n\n")
    return "".join(parts)


def _build_word_summary(highlights_by_category, total):
    """Return a python-docx Document holding the same summary as the other exports."""
    doc = Document()
    doc.add_heading('PDF Highlights Summary', level=1)
    doc.add_paragraph(f'Total highlights found: {total}')
    for category, category_highlights in highlights_by_category.items():
        doc.add_heading(category, level=2)
        for highlight in category_highlights:
            doc.add_heading(f'Page {highlight["page"]}', level=3)
            doc.add_paragraph(highlight['text'])
            if highlight['comment']:
                doc.add_paragraph(f'Comment: {highlight["comment"]}')
            doc.add_paragraph('---')
    return doc


# Main flow: save the upload to a scratch directory, extract its highlights,
# show them grouped by category, and offer the summary as a download.
if uploaded_file is not None:
    try:
        temp_dir = tempfile.mkdtemp()
        try:
            # Read the upload once and reuse the bytes for both the size
            # check and the write below, instead of calling getvalue() twice.
            upload_bytes = uploaded_file.getvalue()

            # Add file size check
            file_size = len(upload_bytes) / (1024 * 1024)  # Size in MB
            if file_size > 200:  # Streamlit Cloud's default limit
                st.error(f"File size ({file_size:.1f}MB) exceeds the 200MB limit. Please upload a smaller file.")
                # NOTE(review): st.stop() ends the script run here, so code
                # after this block is not executed for oversized files --
                # confirm that is intended.
                st.stop()

            original_filename = uploaded_file.name
            logger.info(f"Processing file: {original_filename}")

            safe_filename = sanitize_filename(original_filename)
            logger.info(f"Sanitized filename: {safe_filename}")

            # Shared stem for every download name offered below.
            download_stem = f"highlights_{os.path.splitext(safe_filename)[0]}"

            # Only the extension of the user-supplied name reaches the
            # filesystem; the temp file itself has a fixed base name.
            file_extension = os.path.splitext(safe_filename)[1]
            temp_file_path = os.path.join(temp_dir, f"temp_upload{file_extension}")

            with open(temp_file_path, "wb") as f:
                f.write(upload_bytes)

            logger.info(f"Saved to temp file: {temp_file_path}")

            try:
                with st.spinner("Extracting highlights..."):
                    # Test if we can open the PDF first. A failure is
                    # re-raised with context so the handler below reports it
                    # exactly once (previously the user saw two error boxes).
                    try:
                        test_pdf = fitz.open(temp_file_path)
                        try:
                            logger.info(f"Successfully opened PDF with {test_pdf.page_count} pages")
                        finally:
                            # Close the probe document even if reading
                            # page_count fails.
                            test_pdf.close()
                    except Exception as pdf_error:
                        raise RuntimeError(f"Failed to open PDF: {pdf_error}") from pdf_error

                    # Extract highlights
                    highlights = extract_highlights(temp_file_path)
                    logger.info(f"Extracted {len(highlights)} highlights")

                if highlights:
                    st.success(f"Found {len(highlights)} highlights!")

                    highlights_by_category = _group_by_category(highlights)

                    for category, category_highlights in highlights_by_category.items():
                        with st.expander(f"📌 {category} ({len(category_highlights)} highlights)"):
                            for highlight in category_highlights:
                                st.markdown(f"**Page {highlight['page']}**")
                                try:
                                    st.markdown(highlight['text'])
                                except Exception:
                                    # Drop characters that cannot be encoded
                                    # as UTF-8 and fall back to plain text.
                                    clean_text = highlight['text'].encode('utf-8', 'ignore').decode('utf-8')
                                    st.text(clean_text)  # Fallback to plain text

                                if highlight['comment']:
                                    try:
                                        st.markdown(f"> {highlight['comment']}")
                                    except Exception:
                                        st.text(f"Comment: {highlight['comment']}")
                                st.markdown("---")

                    file_format = st.selectbox("Select file format for download:", ["Markdown", "Txt", "Word"])

                    if file_format == "Markdown":
                        markdown_content = _build_markdown_summary(highlights_by_category, len(highlights))

                        st.download_button(
                            label="Download Highlights Summary (Markdown)",
                            data=markdown_content.encode('utf-8'),  # Ensure proper encoding
                            file_name=f"{download_stem}.md",
                            mime="text/markdown"
                        )

                    elif file_format == "Txt":
                        text_content = _build_text_summary(highlights_by_category, len(highlights))

                        st.download_button(
                            label="Download Highlights Summary (Text)",
                            data=text_content.encode('utf-8'),  # Ensure proper encoding
                            file_name=f"{download_stem}.txt",
                            mime="text/plain"
                        )

                    elif file_format == "Word":
                        doc = _build_word_summary(highlights_by_category, len(highlights))

                        # The document is written into temp_dir and read back
                        # into memory, so the download does not depend on the
                        # file once temp_dir is removed below.
                        word_file_path = os.path.join(temp_dir, f"{download_stem}.docx")
                        doc.save(word_file_path)

                        with open(word_file_path, "rb") as f:
                            docx_bytes = f.read()

                        st.download_button(
                            label="Download Highlights Summary (Word)",
                            data=docx_bytes,
                            file_name=f"{download_stem}.docx",
                            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                        )
                else:
                    st.warning("No highlights found in the PDF.")

            except Exception as e:
                # Top-level boundary for PDF handling: log with traceback and
                # show a single error message to the user.
                logger.exception("Error processing PDF: %s", e)
                st.error(f"Error processing PDF: {str(e)}")

        finally:
            # Always remove the scratch directory, whatever happened above.
            shutil.rmtree(temp_dir, ignore_errors=True)

    except Exception as e:
        # Covers failures outside the PDF-handling section (size check,
        # filename handling, writing the temp file).
        logger.exception("Error processing file: %s", e)
        st.error(f"Error processing file: {str(e)}")

# Static "About" panel for the sidebar: a legend of highlight colors and the
# category label shown for each.
about_markdown = """
### About
This app extracts and categorizes highlights from PDF files based on their colors:

- 💡 Light Blue: Ideas & Insights
- 📝 Yellow: General Notes
- ✅ Green: Action Items / To-Do
- 📖 Pink: Quotes & References
- ⚠️ Red: Critical Issues / Warnings
"""
st.sidebar.markdown(about_markdown)