Spaces:
Sleeping
Sleeping
File size: 9,802 Bytes
76b7b9f 0bad002 76b7b9f 0bad002 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
import streamlit as st
import os
import logging
from PDF_highlight_extractor import extract_highlights, clean_chinese_text
from docx import Document
import urllib.parse
import tempfile
import shutil
import re
import traceback
import fitz # PyMuPDF
# Set up logging: request INFO-level output from the root logger and create
# the module-level logger that the rest of this script writes to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def sanitize_filename(filename):
    """Sanitize a client-supplied filename for web use, keeping Chinese characters.

    The name is percent-decoded and reduced to its last path component
    (forward slashes and backslashes are both treated as separators, so a
    Windows-style path is stripped on every platform). Every character that
    is not a word character, a CJK ideograph, ``-`` or ``.`` is replaced by
    ``_``, and the stem is capped at 100 characters.

    Args:
        filename: Name supplied by the client; may be percent-encoded and may
            contain directory components.

    Returns:
        The sanitized ``stem + extension``. When the stem is empty or made of
        dots only (for example ``""`` or ``".."``), ``"upload"`` is used as
        the stem. If sanitizing raises, a generic ``upload_<hash>.pdf`` name
        is returned instead of propagating the error.
    """
    try:
        filename = urllib.parse.unquote(filename)
        # os.path.basename only splits on the host OS separator; normalize
        # backslashes first so "C:\dir\file.pdf" is reduced to "file.pdf"
        # on POSIX hosts too.
        filename = os.path.basename(filename.replace('\\', '/'))
        # Use a safer regex for filename sanitization
        filename = re.sub(r'[^\w\u4e00-\u9fff\-\.]', '_', filename)
        name, ext = os.path.splitext(filename)
        name = name[:100]
        # A stem of "" or only dots is not a usable file name.
        if not name.strip('.'):
            name = "upload"
        return name + ext
    except Exception as e:
        # Deliberate best-effort: never let a bad name abort the upload.
        logger.warning(f"Error sanitizing filename: {str(e)}")
        # Fallback to a simple safe name (hash() is only stable within one
        # interpreter process).
        return f"upload_{hash(str(filename))}.pdf"
# Page chrome: configure the browser tab and layout, then render the header.
# NOTE(review): the icon in the page config and title had been reduced to a
# stray "π" (a mis-decoded emoji). The original glyph could not be recovered;
# 📄 is a best-guess replacement — confirm the intended icon.
st.set_page_config(
    page_title="PDF Highlight Extractor",
    page_icon="📄",
    layout="wide",
)
st.title("📄 PDF Highlight Extractor")
st.markdown("Upload a PDF file to extract and categorize highlights.")

# Display PyMuPDF version for debugging
st.sidebar.text(f"PyMuPDF version: {fitz.__version__}")

# Only PDF uploads are accepted; the value is None until a file is chosen.
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
# Largest upload the script will process, in megabytes (Streamlit Cloud's
# default limit).
MAX_UPLOAD_MB = 200


def _group_by_category(highlights):
    """Bucket highlights by their 'category' value, keeping first-seen order."""
    grouped = {}
    for highlight in highlights:
        grouped.setdefault(highlight['category'], []).append(highlight)
    return grouped


def _markdown_summary(total, highlights_by_category):
    """Build the Markdown export: one section per category, one entry per highlight."""
    parts = [
        "# PDF Highlights Summary\n\n",
        f"Total highlights found: {total}\n\n",
    ]
    for category, category_highlights in highlights_by_category.items():
        parts.append(f"## {category}\n\n")
        for highlight in category_highlights:
            parts.append(f"### Page {highlight['page']}\n\n")
            parts.append(f"{highlight['text']}\n\n")
            if highlight['comment']:
                parts.append(f"> {highlight['comment']}\n\n")
            parts.append("---\n\n")
    return "".join(parts)


def _text_summary(total, highlights_by_category):
    """Build the plain-text export with the same grouping as the Markdown one."""
    parts = [f"PDF Highlights Summary\n\nTotal highlights found: {total}\n\n"]
    for category, category_highlights in highlights_by_category.items():
        parts.append(f"{category}\n\n")
        for highlight in category_highlights:
            parts.append(f"Page {highlight['page']}: {highlight['text']}\n")
            if highlight['comment']:
                parts.append(f"Comment: {highlight['comment']}\n")
            parts.append("---\n\n")
    return "".join(parts)


def _docx_summary(total, highlights_by_category):
    """Build the Word export as a python-docx Document (not yet saved)."""
    doc = Document()
    doc.add_heading('PDF Highlights Summary', level=1)
    doc.add_paragraph(f'Total highlights found: {total}')
    for category, category_highlights in highlights_by_category.items():
        doc.add_heading(category, level=2)
        for highlight in category_highlights:
            doc.add_heading(f'Page {highlight["page"]}', level=3)
            doc.add_paragraph(highlight['text'])
            if highlight['comment']:
                doc.add_paragraph(f'Comment: {highlight["comment"]}')
            doc.add_paragraph('---')
    return doc


def _show_highlights(highlights_by_category):
    """Render each category in its own expander, one entry per highlight."""
    for category, category_highlights in highlights_by_category.items():
        # NOTE(review): the expander icon had been reduced to a stray "π"
        # (a mis-decoded emoji); 📂 is a best-guess replacement — confirm.
        with st.expander(f"📂 {category} ({len(category_highlights)} highlights)"):
            for highlight in category_highlights:
                st.markdown(f"**Page {highlight['page']}**")
                try:
                    st.markdown(highlight['text'])
                except Exception:
                    # Drop anything that cannot be encoded as UTF-8 and
                    # fall back to plain text.
                    clean_text = highlight['text'].encode('utf-8', 'ignore').decode('utf-8')
                    st.text(clean_text)
                if highlight['comment']:
                    try:
                        st.markdown(f"> {highlight['comment']}")
                    except Exception:
                        st.text(f"Comment: {highlight['comment']}")
                st.markdown("---")


# Main flow: validate the upload, write it to a private temp directory,
# extract the highlights, show them and offer a download in the chosen format.
if uploaded_file is not None:
    try:
        # Read the upload once; the same bytes are used for the size check
        # and for the temp file.
        file_bytes = uploaded_file.getvalue()
        file_size = len(file_bytes) / (1024 * 1024)  # Size in MB
        if file_size > MAX_UPLOAD_MB:
            st.error(
                f"File size ({file_size:.1f}MB) exceeds the {MAX_UPLOAD_MB}MB limit. "
                "Please upload a smaller file."
            )
            st.stop()

        original_filename = uploaded_file.name
        logger.info(f"Processing file: {original_filename}")
        safe_filename = sanitize_filename(original_filename)
        logger.info(f"Sanitized filename: {safe_filename}")
        base_name, file_extension = os.path.splitext(safe_filename)
        download_stem = f"highlights_{base_name}"

        temp_dir = tempfile.mkdtemp()
        try:
            temp_file_path = os.path.join(temp_dir, f"temp_upload{file_extension}")
            with open(temp_file_path, "wb") as f:
                f.write(file_bytes)
            logger.info(f"Saved to temp file: {temp_file_path}")

            try:
                with st.spinner("Extracting highlights..."):
                    # Test if we can open the PDF first. The context manager
                    # closes the probe handle even if logging fails.
                    try:
                        with fitz.open(temp_file_path) as test_pdf:
                            logger.info(f"Successfully opened PDF with {test_pdf.page_count} pages")
                    except Exception as pdf_error:
                        # Re-raise with context instead of calling st.error
                        # here: the handler below reports the failure, so the
                        # user sees it exactly once.
                        raise RuntimeError(f"Failed to open PDF: {pdf_error}") from pdf_error

                    # Extract highlights
                    highlights = extract_highlights(temp_file_path)
                logger.info(f"Extracted {len(highlights)} highlights")

                if highlights:
                    st.success(f"Found {len(highlights)} highlights!")
                    highlights_by_category = _group_by_category(highlights)
                    _show_highlights(highlights_by_category)

                    file_format = st.selectbox(
                        "Select file format for download:",
                        ["Markdown", "Txt", "Word"],
                    )
                    if file_format == "Markdown":
                        markdown_content = _markdown_summary(len(highlights), highlights_by_category)
                        st.download_button(
                            label="Download Highlights Summary (Markdown)",
                            data=markdown_content.encode('utf-8'),  # Ensure proper encoding
                            file_name=f"{download_stem}.md",
                            mime="text/markdown",
                        )
                    elif file_format == "Txt":
                        text_content = _text_summary(len(highlights), highlights_by_category)
                        st.download_button(
                            label="Download Highlights Summary (Text)",
                            data=text_content.encode('utf-8'),  # Ensure proper encoding
                            file_name=f"{download_stem}.txt",
                            mime="text/plain",
                        )
                    elif file_format == "Word":
                        # Saved inside temp_dir so it is removed with the
                        # upload; the bytes are read back for the download.
                        word_file_path = os.path.join(temp_dir, f"{download_stem}.docx")
                        _docx_summary(len(highlights), highlights_by_category).save(word_file_path)
                        with open(word_file_path, "rb") as f:
                            docx_bytes = f.read()
                        st.download_button(
                            label="Download Highlights Summary (Word)",
                            data=docx_bytes,
                            file_name=f"{download_stem}.docx",
                            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                        )
                else:
                    st.warning("No highlights found in the PDF.")
            except Exception as e:
                # logger.exception records the message and the traceback.
                logger.exception("Error processing PDF: %s", e)
                st.error(f"Error processing PDF: {str(e)}")
        finally:
            # Always remove the temp copy of the upload and any export file.
            shutil.rmtree(temp_dir, ignore_errors=True)
    except Exception as e:
        logger.exception("Error processing file: %s", e)
        st.error(f"Error processing file: {str(e)}")
# Sidebar legend mapping highlight colors to categories.
# NOTE(review): the emoji in this legend were mis-decoded. 💡, ✅ and ⚠️ were
# reconstructed from the surviving characters; the Yellow (📝) and Pink (📌)
# glyphs could not be recovered and are best guesses — confirm.
st.sidebar.markdown("""
### About
This app extracts and categorizes highlights from PDF files based on their colors:
- 💡 Light Blue: Ideas & Insights
- 📝 Yellow: General Notes
- ✅ Green: Action Items / To-Do
- 📌 Pink: Quotes & References
- ⚠️ Red: Critical Issues / Warnings
""")