initial

Files changed:
- requirements.txt +7 -3
- src/PDF_highlight_extractor.py +152 -0
- src/streamlit_app.py +207 -38
- src/utils/decompose.py +199 -0

requirements.txt
CHANGED
@@ -1,3 +1,7 @@
+setuptools
+PyMuPDF>=1.22.0
+numpy
+streamlit>=1.28.0
+scikit-learn
+python-docx>=0.8.11
+urllib3>=2.0.0
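
With these pins in place, a fresh local environment can typically be prepared with pip install -r requirements.txt (a recent Python 3 interpreter is assumed).
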
src/PDF_highlight_extractor.py
ADDED
@@ -0,0 +1,152 @@
+import fitz  # PyMuPDF
+import numpy as np
+import re
+from utils.decompose import Decomposer
+
+
+def clean_chinese_text(text):
+    """Clean up text by removing spaces between Chinese characters."""
+    # Remove spaces between Chinese characters
+    text = re.sub(r'([\u4e00-\u9fff])\s+([\u4e00-\u9fff])', r'\1\2', text)
+    # Remove spaces before and after Chinese punctuation
+    text = re.sub(r'\s+([\u3000-\u303f\uff00-\uffef])', r'\1', text)
+    text = re.sub(r'([\u3000-\u303f\uff00-\uffef])\s+', r'\1', text)
+    return text.strip()
+
+
+def categorize_highlight(color):
+    """Categorize a highlight by the closest color match, using Euclidean distance."""
+    # Customize the highlight categories as you like.
+    color_mapping = {
+        (0.5608, 0.8706, 0.9765): "Ideas & Insights",           # Light Blue
+        (1.0, 0.9412, 0.4): "General Notes",                    # Yellow
+        (0.4902, 0.9412, 0.4): "Action Items / To-Do",          # Green
+        (0.9686, 0.6, 0.8196): "Quotes & References",           # Pink
+        (0.9216, 0.2863, 0.2863): "Critical Issues / Warnings"  # Red
+    }
+
+    # Convert the color to a NumPy array for the distance calculation
+    color_array = np.array(color)
+
+    # Find the closest reference color using Euclidean distance
+    best_match = min(
+        color_mapping.keys(),
+        key=lambda ref_color: sum((color_array[i] - ref_color[i]) ** 2 for i in range(len(color_array))) ** 0.5
+    )
+
+    return color_mapping[best_match]
+
+
+def clean_text_by_punctuation(text):
+    """Clean text by removing content after the last proper punctuation mark."""
+    # Proper ending punctuation marks (both Chinese and English)
+    ending_punctuation = {'.', '。', '?', '？', '!', '！'}
+
+    # Find the last occurrence of any ending punctuation
+    last_punct_index = -1
+    for i, char in enumerate(text):
+        if char in ending_punctuation:
+            last_punct_index = i
+
+    # If no proper ending punctuation is found, return an empty string
+    if last_punct_index == -1:
+        return ""
+
+    # Return text up to and including the last punctuation mark
+    return text[:last_punct_index + 1]
+
+
+def extract_highlights(pdf_path):
+    """
+    Extract all highlights from a PDF file.
+
+    Args:
+        pdf_path (str): Path to the PDF file
+
+    Returns:
+        list: List of dictionaries containing highlight information
+    """
+    highlights = []
+    try:
+        pdf_document = fitz.open(pdf_path)
+
+        min_y, max_y, single_y = Decomposer(pdf_document).run()
+        for page_num in range(pdf_document.page_count):
+            page = pdf_document[page_num]
+
+            for annot in page.annots():
+                if annot.type[0] == 8:  # Highlight annotation
+                    # Extract the highlighted text
+                    highlight_text = page.get_text("text", clip=annot.rect, sort=True, flags=1).strip()
+                    highlight_text = highlight_text.encode("utf-8", "ignore").decode("utf-8")
+
+                    # Extract the annotation color
+                    color_rgb = annot.colors.get("stroke", [0, 0, 0])  # Default to black if undefined
+                    category = categorize_highlight(color_rgb)
+
+                    # Extract the popup comment if it exists
+                    comment = annot.info.get("content", "").strip() if annot.has_popup else ""
+                    # Store structured highlight data
+                    if highlight_text:
+                        for chunk in highlight_text.split("\n\n"):  # Handle consecutive highlights
+                            cleaned_chunk = clean_text_by_punctuation(clean_chinese_text(chunk))
+                            if not cleaned_chunk:  # Skip if no valid text remains after cleaning
+                                continue
+
+                            if (highlights and
+                                    None not in (min_y, max_y, single_y) and  # Decomposer may return Nones
+                                    highlights[-1]['page'] == page_num and
+                                    highlights[-1]['rect'][3] > (max_y - single_y) and
+                                    annot.rect[1] < (min_y + single_y)):
+                                # The highlight continues across a page break; merge it
+                                highlights[-1]['text'] += cleaned_chunk
+
+                            else:
+                                highlights.append({
+                                    "page": page_num + 1,
+                                    "text": cleaned_chunk,
+                                    "category": category,
+                                    "comment": comment,
+                                    "rect": annot.rect
+                                })
+
+        pdf_document.close()
+        return highlights
+    except Exception as e:
+        print(f"Error processing PDF: {str(e)}")
+        return []
+
+
+def main():
+    # Example usage
+    pdf_path = "OneDrive-2025-04-15/专题讨论_线下研讨会DeepSeek效应追踪AI产业的持续变革-12Mar2025_zho.pdf"
+    highlights = extract_highlights(pdf_path)
+
+    # Create markdown content
+    markdown_content = "# PDF Highlights Summary\n\n"
+    markdown_content += f"Total highlights found: {len(highlights)}\n\n"
+
+    # Group highlights by category
+    highlights_by_category = {}
+    for highlight in highlights:
+        category = highlight['category']
+        if category not in highlights_by_category:
+            highlights_by_category[category] = []
+        highlights_by_category[category].append(highlight)
+
+    # Add highlights grouped by category
+    for category, category_highlights in highlights_by_category.items():
+        markdown_content += f"## {category}\n\n"
+        for highlight in category_highlights:
+            markdown_content += f"### Page {highlight['page']}\n\n"
+            markdown_content += f"{highlight['text']}\n\n"
+            if highlight['comment']:
+                markdown_content += f"> {highlight['comment']}\n\n"
+            markdown_content += "---\n\n"
+
+    # Save to a markdown file
+    output_file = "highlights_summary.md"
+    with open(output_file, "w", encoding="utf-8") as f:
+        f.write(markdown_content)
+
+    print(f"Highlights summary has been saved to {output_file}")
+
+
+if __name__ == "__main__":
+    main()
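
A minimal usage sketch of the extractor, not part of the commit (notes.pdf is a placeholder path):

from collections import Counter
from PDF_highlight_extractor import extract_highlights

# Placeholder path; any PDF containing highlight annotations will do.
highlights = extract_highlights("notes.pdf")

# Each entry is a dict with "page", "text", "category", "comment", and "rect".
print(Counter(h["category"] for h in highlights))
for h in highlights[:3]:
    print(f"p.{h['page']} [{h['category']}] {h['text'][:60]}")
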
src/streamlit_app.py
CHANGED
@@ -1,40 +1,209 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
+import os
+import logging
+from PDF_highlight_extractor import extract_highlights, clean_chinese_text
+from docx import Document
+import urllib.parse
+import tempfile
+import shutil
+import re
+import traceback
+import fitz  # PyMuPDF
 
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def sanitize_filename(filename):
+    """Sanitize a filename for web use while preserving Chinese characters."""
+    try:
+        filename = urllib.parse.unquote(filename)
+        filename = os.path.basename(filename)
+        # Use a safer regex for filename sanitization
+        filename = re.sub(r'[^\w\u4e00-\u9fff\-\.]', '_', filename)
+        name, ext = os.path.splitext(filename)
+        if len(name) > 100:
+            name = name[:100]
+        return name + ext
+    except Exception as e:
+        logger.warning(f"Error sanitizing filename: {str(e)}")
+        # Fall back to a simple safe name
+        return f"upload_{hash(str(filename))}.pdf"
+
+st.set_page_config(
+    page_title="PDF Highlight Extractor",
+    page_icon="📄",
+    layout="wide"
+)
+
+st.title("📄 PDF Highlight Extractor")
+st.markdown("Upload a PDF file to extract and categorize highlights.")
+
+# Display the PyMuPDF version for debugging
+st.sidebar.text(f"PyMuPDF version: {fitz.__version__}")
+
+uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
+
+if uploaded_file is not None:
+    try:
+        temp_dir = tempfile.mkdtemp()
+        try:
+            # Check the file size
+            file_size = len(uploaded_file.getvalue()) / (1024 * 1024)  # Size in MB
+            if file_size > 200:  # Streamlit Cloud's default limit
+                st.error(f"File size ({file_size:.1f}MB) exceeds the 200MB limit. Please upload a smaller file.")
+                st.stop()
+
+            original_filename = uploaded_file.name
+            logger.info(f"Processing file: {original_filename}")
+
+            safe_filename = sanitize_filename(original_filename)
+            logger.info(f"Sanitized filename: {safe_filename}")
+
+            file_extension = os.path.splitext(safe_filename)[1]
+            temp_file_path = os.path.join(temp_dir, f"temp_upload{file_extension}")
+
+            with open(temp_file_path, "wb") as f:
+                f.write(uploaded_file.getvalue())
+
+            logger.info(f"Saved to temp file: {temp_file_path}")
+
+            try:
+                with st.spinner("Extracting highlights..."):
+                    # Check that the PDF can be opened at all
+                    try:
+                        test_pdf = fitz.open(temp_file_path)
+                        logger.info(f"Successfully opened PDF with {test_pdf.page_count} pages")
+                        test_pdf.close()
+                    except Exception as pdf_error:
+                        logger.error(f"Failed to open PDF: {str(pdf_error)}")
+                        st.error(f"Failed to open PDF: {str(pdf_error)}")
+                        raise
+
+                    # Extract highlights
+                    highlights = extract_highlights(temp_file_path)
+                    logger.info(f"Extracted {len(highlights)} highlights")
+
+                if highlights:
+                    st.success(f"Found {len(highlights)} highlights!")
+
+                    highlights_by_category = {}
+                    for highlight in highlights:
+                        category = highlight['category']
+                        if category not in highlights_by_category:
+                            highlights_by_category[category] = []
+                        highlights_by_category[category].append(highlight)
+
+                    for category, category_highlights in highlights_by_category.items():
+                        with st.expander(f"📌 {category} ({len(category_highlights)} highlights)"):
+                            for highlight in category_highlights:
+                                st.markdown(f"**Page {highlight['page']}**")
+                                try:
+                                    st.markdown(highlight['text'])
+                                except Exception:
+                                    clean_text = highlight['text'].encode('utf-8', 'ignore').decode('utf-8')
+                                    st.text(clean_text)  # Fall back to plain text
+
+                                if highlight['comment']:
+                                    try:
+                                        st.markdown(f"> {highlight['comment']}")
+                                    except Exception:
+                                        st.text(f"Comment: {highlight['comment']}")
+                                st.markdown("---")
+
+                    file_format = st.selectbox("Select file format for download:", ["Markdown", "Txt", "Word"])
+
+                    if file_format == "Markdown":
+                        markdown_content = "# PDF Highlights Summary\n\n"
+                        markdown_content += f"Total highlights found: {len(highlights)}\n\n"
+
+                        for category, category_highlights in highlights_by_category.items():
+                            markdown_content += f"## {category}\n\n"
+                            for highlight in category_highlights:
+                                markdown_content += f"### Page {highlight['page']}\n\n"
+                                markdown_content += f"{highlight['text']}\n\n"
+                                if highlight['comment']:
+                                    markdown_content += f"> {highlight['comment']}\n\n"
+                                markdown_content += "---\n\n"
+
+                        download_filename = f"highlights_{os.path.splitext(safe_filename)[0]}.md"
+
+                        st.download_button(
+                            label="Download Highlights Summary (Markdown)",
+                            data=markdown_content.encode('utf-8'),  # Ensure proper encoding
+                            file_name=download_filename,
+                            mime="text/markdown"
+                        )
+
+                    elif file_format == "Txt":
+                        text_content = f"PDF Highlights Summary\n\nTotal highlights found: {len(highlights)}\n\n"
+
+                        for category, category_highlights in highlights_by_category.items():
+                            text_content += f"{category}\n\n"
+                            for highlight in category_highlights:
+                                text_content += f"Page {highlight['page']}: {highlight['text']}\n"
+                                if highlight['comment']:
+                                    text_content += f"Comment: {highlight['comment']}\n"
+                            text_content += "---\n\n"
+
+                        download_filename = f"highlights_{os.path.splitext(safe_filename)[0]}.txt"
+
+                        st.download_button(
+                            label="Download Highlights Summary (Text)",
+                            data=text_content.encode('utf-8'),  # Ensure proper encoding
+                            file_name=download_filename,
+                            mime="text/plain"
+                        )
+
+                    elif file_format == "Word":
+                        doc = Document()
+                        doc.add_heading('PDF Highlights Summary', level=1)
+                        doc.add_paragraph(f'Total highlights found: {len(highlights)}')
+
+                        for category, category_highlights in highlights_by_category.items():
+                            doc.add_heading(category, level=2)
+                            for highlight in category_highlights:
+                                doc.add_heading(f'Page {highlight["page"]}', level=3)
+                                doc.add_paragraph(highlight['text'])
+                                if highlight['comment']:
+                                    doc.add_paragraph(f'Comment: {highlight["comment"]}')
+                                doc.add_paragraph('---')
+
+                        word_file_path = os.path.join(temp_dir, f"highlights_{os.path.splitext(safe_filename)[0]}.docx")
+                        doc.save(word_file_path)
+
+                        with open(word_file_path, "rb") as f:
+                            docx_bytes = f.read()
+
+                        st.download_button(
+                            label="Download Highlights Summary (Word)",
+                            data=docx_bytes,
+                            file_name=f"highlights_{os.path.splitext(safe_filename)[0]}.docx",
+                            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                        )
+                else:
+                    st.warning("No highlights found in the PDF.")
+
+            except Exception as e:
+                logger.error(f"Error processing PDF: {str(e)}")
+                logger.error(traceback.format_exc())
+                st.error(f"Error processing PDF: {str(e)}")
+
+        finally:
+            shutil.rmtree(temp_dir, ignore_errors=True)
+
+    except Exception as e:
+        logger.error(f"Error processing file: {str(e)}")
+        logger.error(traceback.format_exc())
+        st.error(f"Error processing file: {str(e)}")
+
+st.sidebar.markdown("""
+### About
+This app extracts and categorizes highlights from PDF files based on their colors:
+
+- 💡 Light Blue: Ideas & Insights
+- 📝 Yellow: General Notes
+- ✅ Green: Action Items / To-Do
+- 📖 Pink: Quotes & References
+- ⚠️ Red: Critical Issues / Warnings
+""")
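
Outside the Space, the app should start with the usual Streamlit invocation from the repository root, assuming the dependencies above are installed:

streamlit run src/streamlit_app.py

Streamlit puts the script's own directory on the import path, which is what lets src/streamlit_app.py import PDF_highlight_extractor and utils.decompose as top-level modules.
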
src/utils/decompose.py
ADDED
@@ -0,0 +1,199 @@
+
+from sklearn.cluster import DBSCAN
+import numpy as np
+from itertools import islice
+from collections import Counter
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class DBSCAN_helper:
+    def __init__(self, blocks):
+        self.blocks = blocks
+
+    def run(self):
+        try:
+            if not self.blocks:
+                logger.warning("No blocks provided to DBSCAN_helper")
+                # Return default values
+                self.n_clusters = 0
+                self.labels = np.array([])
+                return
+
+            # Extract features from blocks
+            X = np.array(
+                [(x0, y0, x1, y1, len(text)) for x0, y0, x1, y1, text in self.blocks]
+            )
+
+            # Handle an empty array
+            if X.size == 0:
+                logger.warning("Empty feature array for DBSCAN")
+                self.n_clusters = 0
+                self.labels = np.array([])
+                return
+
+            # Configure DBSCAN with explicit parameters for better control
+            dbscan = DBSCAN(eps=0.5, min_samples=2, metric='euclidean')
+            dbscan.fit(X)
+            labels = dbscan.labels_
+
+            # Count the number of clusters (excluding noise points marked as -1)
+            unique_labels = set(labels)
+            if -1 in unique_labels:
+                unique_labels.remove(-1)
+            self.n_clusters = len(unique_labels)
+            self.labels = labels
+
+            logger.info(f"{self.n_clusters} clusters for {len(self.blocks)} blocks")
+        except Exception as e:
+            logger.error(f"Error in DBSCAN_helper: {str(e)}")
+            # Set default values on error
+            self.n_clusters = 0
+            self.labels = np.array([-1] * len(self.blocks)) if self.blocks else np.array([])
+
+
+class Decomposer:
+    def __init__(self, pdf_document=None):
+        if not pdf_document:
+            raise ValueError("PDF document must be provided")
+        self.pdf_doc = pdf_document
+
+    def calc_rect_center(self, rect, reverse_y=False):
+        try:
+            if reverse_y:
+                x0, y0, x1, y1 = rect[0], -rect[1], rect[2], -rect[3]
+            else:
+                x0, y0, x1, y1 = rect
+
+            x_center = (x0 + x1) / 2
+            y_center = (y0 + y1) / 2
+            return (x_center, y_center)
+        except Exception as e:
+            logger.error(f"Error calculating rectangle center: {str(e)}")
+            return (0, 0)  # Return default values on error
+
+    def get_rect_labels(self):
+        try:
+            rect_centers = []
+            rects = []
+            visual_label_texts = []
+            categorize_vectors = []
+
+            for page_idx, page in islice(enumerate(self.pdf_doc), len(self.pdf_doc)):
+                try:
+                    blocks = page.get_text("blocks")
+                    page_cnt = page_idx + 1
+                    logger.debug(f"=== Start Page {page_cnt}: {len(blocks)} blocks ===")
+                    block_cnt = 0
+
+                    for block in blocks:
+                        try:
+                            block_rect = block[:4]  # (x0, y0, x1, y1)
+                            x0, y0, x1, y1 = block_rect
+                            rects.append(block_rect)
+
+                            # Handle possible encoding issues with block text
+                            block_text = block[4]
+                            if isinstance(block_text, bytes):
+                                block_text = block_text.decode('utf-8', errors='ignore')
+
+                            block_num = block[5]
+                            block_cnt = block_num + 1
+
+                            rect_center = self.calc_rect_center(block_rect, reverse_y=True)
+                            rect_centers.append(rect_center)
+                            visual_label_text = f"({page_cnt}.{block_cnt})"
+                            visual_label_texts.append(visual_label_text)
+
+                            # block_type = "text" if block[6] == 0 else "image"
+                            categorize_vectors.append((*block_rect, block_text))
+                        except Exception as block_error:
+                            logger.warning(f"Error processing block {block_cnt} on page {page_cnt}: {str(block_error)}")
+                            continue
+                except Exception as page_error:
+                    logger.warning(f"Error processing page {page_idx + 1}: {str(page_error)}")
+                    continue
+
+            if not categorize_vectors:
+                logger.warning("No categorize vectors generated")
+                return []
+
+            categorizer = DBSCAN_helper(categorize_vectors)
+            categorizer.run()
+
+            # Make sure the lengths match
+            if len(rects) != len(categorizer.labels):
+                logger.warning(f"Length mismatch: rects={len(rects)}, labels={len(categorizer.labels)}")
+                # Handle the mismatch by creating default labels
+                if categorizer.labels.size == 0:  # If the labels array is empty
+                    result = [(rect, -1) for rect in rects]  # Assign all to noise (-1)
+                else:
+                    # Truncate to the shorter length
+                    min_len = min(len(rects), len(categorizer.labels))
+                    result = [(rects[i], categorizer.labels[i]) for i in range(min_len)]
+                return result
+
+            return [(rects[i], categorizer.labels[i]) for i in range(len(rects))]
+        except Exception as e:
+            logger.error(f"Error in get_rect_labels: {str(e)}")
+            return []  # Return an empty result on error
+
+    def get_page_stats(self, res):
+        try:
+            if not res:
+                logger.warning("Empty input to get_page_stats")
+                return None, None, None  # Handle empty input
+
+            x_counter = Counter(x for _, x in res)
+            y_diffs = Counter(i[3] - i[1] for i, _ in res)
+
+            # Handle empty counters
+            if not x_counter or not y_diffs:
+                logger.warning("Empty counters in get_page_stats")
+                return None, None, None
+
+            most_common_x = x_counter.most_common(1)[0][0]
+            threshold = float('inf')
+            min_x = float('inf')
+
+            for i, x in res:
+                min_x = min(i[0], min_x)
+                if x != most_common_x and i[0] < threshold:
+                    threshold = i[0]
+
+            if threshold == float('inf'):  # Fallback
+                threshold = min_x
+
+            min_y, max_y = float('inf'), -float('inf')  # Changed from 0 to -inf
+            for i, x in res:
+                if x == -1 and i[0] <= threshold:
+                    min_y = min(min_y, i[1])
+                    max_y = max(max_y, i[-1])
+
+            single_y = y_diffs.most_common(1)[0][0] if y_diffs else 0
+
+            # Additional validity checks
+            if min_y == float('inf'):
+                min_y = None
+            if max_y == -float('inf'):
+                max_y = None
+
+            # Ensure single_y is positive
+            single_y = abs(single_y) if single_y else 0
+
+            return min_y, max_y, single_y
+        except Exception as e:
+            logger.error(f"Error in get_page_stats: {str(e)}")
+            return None, None, None  # Return default values on error
+
+    def run(self):
+        try:
+            rect_labels = self.get_rect_labels()
+            stats = self.get_page_stats(rect_labels)
+            logger.info(f"Page stats: min_y={stats[0]}, max_y={stats[1]}, single_y={stats[2]}")
+            return stats
+        except Exception as e:
+            logger.error(f"Error in Decomposer.run: {str(e)}")
+            return None, None, None  # Return default values on error
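
For intuition about what Decomposer hands back to the extractor, a minimal sketch (sample.pdf is a placeholder path): run() returns (min_y, max_y, single_y), an estimate of the vertical extent of the main text area plus the most common block height, which extract_highlights uses to decide whether a highlight at the top of one page continues a highlight that ended near the bottom of the previous page.

import fitz  # PyMuPDF
from utils.decompose import Decomposer

# Placeholder path; any text-based PDF will do.
doc = fitz.open("sample.pdf")
min_y, max_y, single_y = Decomposer(doc).run()
doc.close()

# Any of the three may be None if layout analysis found nothing usable;
# extract_highlights skips cross-page merging in that case.
print(f"text-area y range: {min_y}..{max_y}, typical block height: {single_y}")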