initial

Files changed:
- requirements.txt +7 -3
- src/PDF_highlight_extractor.py +152 -0
- src/streamlit_app.py +207 -38
- src/utils/decompose.py +199 -0

requirements.txt
CHANGED
@@ -1,3 +1,7 @@
+setuptools
+PyMuPDF>=1.22.0
+numpy
+streamlit>=1.28.0
+scikit-learn
+python-docx>=0.8.11
+urllib3>=2.0.0
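
With these pins in place, a fresh local environment can typically be prepared with pip install -r requirements.txt (a recent Python 3 interpreter is assumed).
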
src/PDF_highlight_extractor.py
ADDED
@@ -0,0 +1,152 @@
+import fitz  # PyMuPDF
+import numpy as np
+import re
+from utils.decompose import Decomposer
+
+
+def clean_chinese_text(text):
+    """Clean up text by removing spaces between Chinese characters."""
+    # Remove spaces between Chinese characters
+    text = re.sub(r'([\u4e00-\u9fff])\s+([\u4e00-\u9fff])', r'\1\2', text)
+    # Remove spaces before and after Chinese punctuation
+    text = re.sub(r'\s+([\u3000-\u303f\uff00-\uffef])', r'\1', text)
+    text = re.sub(r'([\u3000-\u303f\uff00-\uffef])\s+', r'\1', text)
+    return text.strip()
+
+
+def categorize_highlight(color):
+    """Categorize a highlight by the closest color match, using Euclidean distance."""
+    # Customize the highlight categories as you like.
+    color_mapping = {
+        (0.5608, 0.8706, 0.9765): "Ideas & Insights",           # Light Blue
+        (1.0, 0.9412, 0.4): "General Notes",                    # Yellow
+        (0.4902, 0.9412, 0.4): "Action Items / To-Do",          # Green
+        (0.9686, 0.6, 0.8196): "Quotes & References",           # Pink
+        (0.9216, 0.2863, 0.2863): "Critical Issues / Warnings"  # Red
+    }
+
+    # Convert the color to a NumPy array for the distance calculation
+    color_array = np.array(color)
+
+    # Find the closest reference color using Euclidean distance
+    best_match = min(
+        color_mapping.keys(),
+        key=lambda ref_color: sum((color_array[i] - ref_color[i]) ** 2 for i in range(len(color_array))) ** 0.5
+    )
+
+    return color_mapping[best_match]
+
+
+def clean_text_by_punctuation(text):
+    """Clean text by removing content after the last proper punctuation mark."""
+    # Proper ending punctuation marks (both Chinese and English)
+    ending_punctuation = {'.', '。', '?', '？', '!', '！'}
+
+    # Find the last occurrence of any ending punctuation
+    last_punct_index = -1
+    for i, char in enumerate(text):
+        if char in ending_punctuation:
+            last_punct_index = i
+
+    # If no proper ending punctuation is found, return an empty string
+    if last_punct_index == -1:
+        return ""
+
+    # Return text up to and including the last punctuation mark
+    return text[:last_punct_index + 1]
+
+
+def extract_highlights(pdf_path):
+    """
+    Extract all highlights from a PDF file.
+
+    Args:
+        pdf_path (str): Path to the PDF file
+
+    Returns:
+        list: List of dictionaries containing highlight information
+    """
+    highlights = []
+    try:
+        pdf_document = fitz.open(pdf_path)
+
+        min_y, max_y, single_y = Decomposer(pdf_document).run()
+        for page_num in range(pdf_document.page_count):
+            page = pdf_document[page_num]
+
+            for annot in page.annots():
+                if annot.type[0] == 8:  # Highlight annotation
+                    # Extract the highlighted text
+                    highlight_text = page.get_text("text", clip=annot.rect, sort=True, flags=1).strip()
+                    highlight_text = highlight_text.encode("utf-8", "ignore").decode("utf-8")
+
+                    # Extract the annotation color
+                    color_rgb = annot.colors.get("stroke", [0, 0, 0])  # Default to black if undefined
+                    category = categorize_highlight(color_rgb)
+
+                    # Extract the popup comment if it exists
+                    comment = annot.info.get("content", "").strip() if annot.has_popup else ""
+                    # Store structured highlight data
+                    if highlight_text:
+                        for chunk in highlight_text.split("\n\n"):  # Handle consecutive highlights
+                            cleaned_chunk = clean_text_by_punctuation(clean_chinese_text(chunk))
+                            if not cleaned_chunk:  # Skip if no valid text remains after cleaning
+                                continue
+
+                            if (highlights and
+                                    None not in (min_y, max_y, single_y) and  # Decomposer may return Nones
+                                    highlights[-1]['page'] == page_num and
+                                    highlights[-1]['rect'][3] > (max_y - single_y) and
+                                    annot.rect[1] < (min_y + single_y)):
+                                # The highlight continues across a page break; merge it
+                                highlights[-1]['text'] += cleaned_chunk
+
+                            else:
+                                highlights.append({
+                                    "page": page_num + 1,
+                                    "text": cleaned_chunk,
+                                    "category": category,
+                                    "comment": comment,
+                                    "rect": annot.rect
+                                })
+
+        pdf_document.close()
+        return highlights
+    except Exception as e:
+        print(f"Error processing PDF: {str(e)}")
+        return []
+
+
+def main():
+    # Example usage
+    pdf_path = "OneDrive-2025-04-15/专题讨论_线下研讨会DeepSeek效应追踪AI产业的持续变革-12Mar2025_zho.pdf"
+    highlights = extract_highlights(pdf_path)
+
+    # Create markdown content
+    markdown_content = "# PDF Highlights Summary\n\n"
+    markdown_content += f"Total highlights found: {len(highlights)}\n\n"
+
+    # Group highlights by category
+    highlights_by_category = {}
+    for highlight in highlights:
+        category = highlight['category']
+        if category not in highlights_by_category:
+            highlights_by_category[category] = []
+        highlights_by_category[category].append(highlight)
+
+    # Add highlights grouped by category
+    for category, category_highlights in highlights_by_category.items():
+        markdown_content += f"## {category}\n\n"
+        for highlight in category_highlights:
+            markdown_content += f"### Page {highlight['page']}\n\n"
+            markdown_content += f"{highlight['text']}\n\n"
+            if highlight['comment']:
+                markdown_content += f"> {highlight['comment']}\n\n"
+            markdown_content += "---\n\n"
+
+    # Save to a markdown file
+    output_file = "highlights_summary.md"
+    with open(output_file, "w", encoding="utf-8") as f:
+        f.write(markdown_content)
+
+    print(f"Highlights summary has been saved to {output_file}")
+
+
+if __name__ == "__main__":
+    main()
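
A minimal usage sketch of the extractor, not part of the commit (notes.pdf is a placeholder path):

from collections import Counter
from PDF_highlight_extractor import extract_highlights

# Placeholder path; any PDF containing highlight annotations will do.
highlights = extract_highlights("notes.pdf")

# Each entry is a dict with "page", "text", "category", "comment", and "rect".
print(Counter(h["category"] for h in highlights))
for h in highlights[:3]:
    print(f"p.{h['page']} [{h['category']}] {h['text'][:60]}")
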
src/streamlit_app.py
CHANGED
@@ -1,40 +1,209 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
+import os
+import logging
+from PDF_highlight_extractor import extract_highlights, clean_chinese_text
+from docx import Document
+import urllib.parse
+import tempfile
+import shutil
+import re
+import traceback
+import fitz  # PyMuPDF
 
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def sanitize_filename(filename):
+    """Sanitize a filename for web use while preserving Chinese characters."""
+    try:
+        filename = urllib.parse.unquote(filename)
+        filename = os.path.basename(filename)
+        # Use a safer regex for filename sanitization
+        filename = re.sub(r'[^\w\u4e00-\u9fff\-\.]', '_', filename)
+        name, ext = os.path.splitext(filename)
+        if len(name) > 100:
+            name = name[:100]
+        return name + ext
+    except Exception as e:
+        logger.warning(f"Error sanitizing filename: {str(e)}")
+        # Fall back to a simple safe name
+        return f"upload_{hash(str(filename))}.pdf"
+
+st.set_page_config(
+    page_title="PDF Highlight Extractor",
+    page_icon="📄",
+    layout="wide"
+)
+
+st.title("📄 PDF Highlight Extractor")
+st.markdown("Upload a PDF file to extract and categorize highlights.")
+
+# Display the PyMuPDF version for debugging
+st.sidebar.text(f"PyMuPDF version: {fitz.__version__}")
+
+uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
+
+if uploaded_file is not None:
+    try:
+        temp_dir = tempfile.mkdtemp()
+        try:
+            # Check the file size
+            file_size = len(uploaded_file.getvalue()) / (1024 * 1024)  # Size in MB
+            if file_size > 200:  # Streamlit Cloud's default limit
+                st.error(f"File size ({file_size:.1f}MB) exceeds the 200MB limit. Please upload a smaller file.")
+                st.stop()
+
+            original_filename = uploaded_file.name
+            logger.info(f"Processing file: {original_filename}")
+
+            safe_filename = sanitize_filename(original_filename)
+            logger.info(f"Sanitized filename: {safe_filename}")
+
+            file_extension = os.path.splitext(safe_filename)[1]
+            temp_file_path = os.path.join(temp_dir, f"temp_upload{file_extension}")
+
+            with open(temp_file_path, "wb") as f:
+                f.write(uploaded_file.getvalue())
+
+            logger.info(f"Saved to temp file: {temp_file_path}")
+
+            try:
+                with st.spinner("Extracting highlights..."):
+                    # Check that the PDF can be opened at all
+                    try:
+                        test_pdf = fitz.open(temp_file_path)
+                        logger.info(f"Successfully opened PDF with {test_pdf.page_count} pages")
+                        test_pdf.close()
+                    except Exception as pdf_error:
+                        logger.error(f"Failed to open PDF: {str(pdf_error)}")
+                        st.error(f"Failed to open PDF: {str(pdf_error)}")
+                        raise
+
+                    # Extract highlights
+                    highlights = extract_highlights(temp_file_path)
+                    logger.info(f"Extracted {len(highlights)} highlights")
+
+                if highlights:
+                    st.success(f"Found {len(highlights)} highlights!")
+
+                    highlights_by_category = {}
+                    for highlight in highlights:
+                        category = highlight['category']
+                        if category not in highlights_by_category:
+                            highlights_by_category[category] = []
+                        highlights_by_category[category].append(highlight)
+
+                    for category, category_highlights in highlights_by_category.items():
+                        with st.expander(f"📌 {category} ({len(category_highlights)} highlights)"):
+                            for highlight in category_highlights:
+                                st.markdown(f"**Page {highlight['page']}**")
+                                try:
+                                    st.markdown(highlight['text'])
+                                except Exception:
+                                    clean_text = highlight['text'].encode('utf-8', 'ignore').decode('utf-8')
+                                    st.text(clean_text)  # Fall back to plain text
+
+                                if highlight['comment']:
+                                    try:
+                                        st.markdown(f"> {highlight['comment']}")
+                                    except Exception:
+                                        st.text(f"Comment: {highlight['comment']}")
+                                st.markdown("---")
+
+                    file_format = st.selectbox("Select file format for download:", ["Markdown", "Txt", "Word"])
+
+                    if file_format == "Markdown":
+                        markdown_content = "# PDF Highlights Summary\n\n"
+                        markdown_content += f"Total highlights found: {len(highlights)}\n\n"
+
+                        for category, category_highlights in highlights_by_category.items():
+                            markdown_content += f"## {category}\n\n"
+                            for highlight in category_highlights:
+                                markdown_content += f"### Page {highlight['page']}\n\n"
+                                markdown_content += f"{highlight['text']}\n\n"
+                                if highlight['comment']:
+                                    markdown_content += f"> {highlight['comment']}\n\n"
+                                markdown_content += "---\n\n"
+
+                        download_filename = f"highlights_{os.path.splitext(safe_filename)[0]}.md"
+
+                        st.download_button(
+                            label="Download Highlights Summary (Markdown)",
+                            data=markdown_content.encode('utf-8'),  # Ensure proper encoding
+                            file_name=download_filename,
+                            mime="text/markdown"
+                        )
+
+                    elif file_format == "Txt":
+                        text_content = f"PDF Highlights Summary\n\nTotal highlights found: {len(highlights)}\n\n"
+
+                        for category, category_highlights in highlights_by_category.items():
+                            text_content += f"{category}\n\n"
+                            for highlight in category_highlights:
+                                text_content += f"Page {highlight['page']}: {highlight['text']}\n"
+                                if highlight['comment']:
+                                    text_content += f"Comment: {highlight['comment']}\n"
+                            text_content += "---\n\n"
+
+                        download_filename = f"highlights_{os.path.splitext(safe_filename)[0]}.txt"
+
+                        st.download_button(
+                            label="Download Highlights Summary (Text)",
+                            data=text_content.encode('utf-8'),  # Ensure proper encoding
+                            file_name=download_filename,
+                            mime="text/plain"
+                        )
+
+                    elif file_format == "Word":
+                        doc = Document()
+                        doc.add_heading('PDF Highlights Summary', level=1)
+                        doc.add_paragraph(f'Total highlights found: {len(highlights)}')
+
+                        for category, category_highlights in highlights_by_category.items():
+                            doc.add_heading(category, level=2)
+                            for highlight in category_highlights:
+                                doc.add_heading(f'Page {highlight["page"]}', level=3)
+                                doc.add_paragraph(highlight['text'])
+                                if highlight['comment']:
+                                    doc.add_paragraph(f'Comment: {highlight["comment"]}')
+                                doc.add_paragraph('---')
+
+                        word_file_path = os.path.join(temp_dir, f"highlights_{os.path.splitext(safe_filename)[0]}.docx")
+                        doc.save(word_file_path)
+
+                        with open(word_file_path, "rb") as f:
+                            docx_bytes = f.read()
+
+                        st.download_button(
+                            label="Download Highlights Summary (Word)",
+                            data=docx_bytes,
+                            file_name=f"highlights_{os.path.splitext(safe_filename)[0]}.docx",
+                            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                        )
+                else:
+                    st.warning("No highlights found in the PDF.")
+
+            except Exception as e:
+                logger.error(f"Error processing PDF: {str(e)}")
+                logger.error(traceback.format_exc())
+                st.error(f"Error processing PDF: {str(e)}")
+
+        finally:
+            shutil.rmtree(temp_dir, ignore_errors=True)
+
+    except Exception as e:
+        logger.error(f"Error processing file: {str(e)}")
+        logger.error(traceback.format_exc())
+        st.error(f"Error processing file: {str(e)}")
+
+st.sidebar.markdown("""
+### About
+This app extracts and categorizes highlights from PDF files based on their colors:
+
+- 💡 Light Blue: Ideas & Insights
+- 📝 Yellow: General Notes
+- ✅ Green: Action Items / To-Do
+- 📖 Pink: Quotes & References
+- ⚠️ Red: Critical Issues / Warnings
+""")
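
Outside the Space, the app should start with the usual Streamlit invocation from the repository root, assuming the dependencies above are installed:

streamlit run src/streamlit_app.py

Streamlit puts the script's own directory on the import path, which is what lets src/streamlit_app.py import PDF_highlight_extractor and utils.decompose as top-level modules.
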
src/utils/decompose.py
ADDED
@@ -0,0 +1,199 @@
+
+from sklearn.cluster import DBSCAN
+import numpy as np
+from itertools import islice
+from collections import Counter
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class DBSCAN_helper:
+    def __init__(self, blocks):
+        self.blocks = blocks
+
+    def run(self):
+        try:
+            if not self.blocks:
+                logger.warning("No blocks provided to DBSCAN_helper")
+                # Return default values
+                self.n_clusters = 0
+                self.labels = np.array([])
+                return
+
+            # Extract features from blocks
+            X = np.array(
+                [(x0, y0, x1, y1, len(text)) for x0, y0, x1, y1, text in self.blocks]
+            )
+
+            # Handle an empty array
+            if X.size == 0:
+                logger.warning("Empty feature array for DBSCAN")
+                self.n_clusters = 0
+                self.labels = np.array([])
+                return
+
+            # Configure DBSCAN with explicit parameters for better control
+            dbscan = DBSCAN(eps=0.5, min_samples=2, metric='euclidean')
+            dbscan.fit(X)
+            labels = dbscan.labels_
+
+            # Count the number of clusters (excluding noise points marked as -1)
+            unique_labels = set(labels)
+            if -1 in unique_labels:
+                unique_labels.remove(-1)
+            self.n_clusters = len(unique_labels)
+            self.labels = labels
+
+            logger.info(f"{self.n_clusters} clusters for {len(self.blocks)} blocks")
+        except Exception as e:
+            logger.error(f"Error in DBSCAN_helper: {str(e)}")
+            # Set default values on error
+            self.n_clusters = 0
+            self.labels = np.array([-1] * len(self.blocks)) if self.blocks else np.array([])
+
+
+class Decomposer:
+    def __init__(self, pdf_document=None):
+        if not pdf_document:
+            raise ValueError("PDF document must be provided")
+        self.pdf_doc = pdf_document
+
+    def calc_rect_center(self, rect, reverse_y=False):
+        try:
+            if reverse_y:
+                x0, y0, x1, y1 = rect[0], -rect[1], rect[2], -rect[3]
+            else:
+                x0, y0, x1, y1 = rect
+
+            x_center = (x0 + x1) / 2
+            y_center = (y0 + y1) / 2
+            return (x_center, y_center)
+        except Exception as e:
+            logger.error(f"Error calculating rectangle center: {str(e)}")
+            return (0, 0)  # Return default values on error
+
+    def get_rect_labels(self):
+        try:
+            rect_centers = []
+            rects = []
+            visual_label_texts = []
+            categorize_vectors = []
+
+            for page_idx, page in islice(enumerate(self.pdf_doc), len(self.pdf_doc)):
+                try:
+                    blocks = page.get_text("blocks")
+                    page_cnt = page_idx + 1
+                    logger.debug(f"=== Start Page {page_cnt}: {len(blocks)} blocks ===")
+                    block_cnt = 0
+
+                    for block in blocks:
+                        try:
+                            block_rect = block[:4]  # (x0, y0, x1, y1)
+                            x0, y0, x1, y1 = block_rect
+                            rects.append(block_rect)
+
+                            # Handle possible encoding issues with block text
+                            block_text = block[4]
+                            if isinstance(block_text, bytes):
+                                block_text = block_text.decode('utf-8', errors='ignore')
+
+                            block_num = block[5]
+                            block_cnt = block_num + 1
+
+                            rect_center = self.calc_rect_center(block_rect, reverse_y=True)
+                            rect_centers.append(rect_center)
+                            visual_label_text = f"({page_cnt}.{block_cnt})"
+                            visual_label_texts.append(visual_label_text)
+
+                            # block_type = "text" if block[6] == 0 else "image"
+                            categorize_vectors.append((*block_rect, block_text))
+                        except Exception as block_error:
+                            logger.warning(f"Error processing block {block_cnt} on page {page_cnt}: {str(block_error)}")
+                            continue
+                except Exception as page_error:
+                    logger.warning(f"Error processing page {page_idx + 1}: {str(page_error)}")
+                    continue
+
+            if not categorize_vectors:
+                logger.warning("No categorize vectors generated")
+                return []
+
+            categorizer = DBSCAN_helper(categorize_vectors)
+            categorizer.run()
+
+            # Make sure the lengths match
+            if len(rects) != len(categorizer.labels):
+                logger.warning(f"Length mismatch: rects={len(rects)}, labels={len(categorizer.labels)}")
+                # Handle the mismatch by creating default labels
+                if categorizer.labels.size == 0:  # If the labels array is empty
+                    result = [(rect, -1) for rect in rects]  # Assign all to noise (-1)
+                else:
+                    # Truncate to the shorter length
+                    min_len = min(len(rects), len(categorizer.labels))
+                    result = [(rects[i], categorizer.labels[i]) for i in range(min_len)]
+                return result
+
+            return [(rects[i], categorizer.labels[i]) for i in range(len(rects))]
+        except Exception as e:
+            logger.error(f"Error in get_rect_labels: {str(e)}")
+            return []  # Return an empty result on error
+
+    def get_page_stats(self, res):
+        try:
+            if not res:
+                logger.warning("Empty input to get_page_stats")
+                return None, None, None  # Handle empty input
+
+            x_counter = Counter(x for _, x in res)
+            y_diffs = Counter(i[3] - i[1] for i, _ in res)
+
+            # Handle empty counters
+            if not x_counter or not y_diffs:
+                logger.warning("Empty counters in get_page_stats")
+                return None, None, None
+
+            most_common_x = x_counter.most_common(1)[0][0]
+            threshold = float('inf')
+            min_x = float('inf')
+
+            for i, x in res:
+                min_x = min(i[0], min_x)
+                if x != most_common_x and i[0] < threshold:
+                    threshold = i[0]
+
+            if threshold == float('inf'):  # Fallback
+                threshold = min_x
+
+            min_y, max_y = float('inf'), -float('inf')  # Changed from 0 to -inf
+            for i, x in res:
+                if x == -1 and i[0] <= threshold:
+                    min_y = min(min_y, i[1])
+                    max_y = max(max_y, i[-1])
+
+            single_y = y_diffs.most_common(1)[0][0] if y_diffs else 0
+
+            # Additional validity checks
+            if min_y == float('inf'):
+                min_y = None
+            if max_y == -float('inf'):
+                max_y = None
+
+            # Ensure single_y is positive
+            single_y = abs(single_y) if single_y else 0
+
+            return min_y, max_y, single_y
+        except Exception as e:
+            logger.error(f"Error in get_page_stats: {str(e)}")
+            return None, None, None  # Return default values on error
+
+    def run(self):
+        try:
+            rect_labels = self.get_rect_labels()
+            stats = self.get_page_stats(rect_labels)
+            logger.info(f"Page stats: min_y={stats[0]}, max_y={stats[1]}, single_y={stats[2]}")
+            return stats
+        except Exception as e:
+            logger.error(f"Error in Decomposer.run: {str(e)}")
+            return None, None, None  # Return default values on error
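
For intuition about what Decomposer hands back to the extractor, a minimal sketch (sample.pdf is a placeholder path): run() returns (min_y, max_y, single_y), an estimate of the vertical extent of the main text area plus the most common block height, which extract_highlights uses to decide whether a highlight at the top of one page continues a highlight that ended near the bottom of the previous page.

import fitz  # PyMuPDF
from utils.decompose import Decomposer

# Placeholder path; any text-based PDF will do.
doc = fitz.open("sample.pdf")
min_y, max_y, single_y = Decomposer(doc).run()
doc.close()

# Any of the three may be None if layout analysis found nothing usable;
# extract_highlights skips cross-page merging in that case.
print(f"text-area y range: {min_y}..{max_y}, typical block height: {single_y}")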