# Spaces: gnlui/fincatch-ocr · Sleeping · File size: 6,225 Bytes · 0bad002

import fitz  # PyMuPDF
import numpy as np
import re
from utils.decompose import Decomposer

def clean_chinese_text(text):
    """Remove layout whitespace from extracted Chinese text.

    Three passes are applied in order, then leading and trailing whitespace
    is stripped:

    1. whitespace runs located between two CJK ideographs (U+4E00-U+9FFF);
    2. whitespace immediately before a character in U+3000-U+303F or
       U+FF00-U+FFEF (CJK punctuation and full-width forms);
    3. whitespace immediately after such a character.

    Whitespace between non-CJK words (e.g. English) is left untouched.

    Args:
        text (str): Text to clean.

    Returns:
        str: The cleaned text.
    """
    # Lookarounds instead of capture groups: a consuming pattern swallows the
    # right-hand ideograph, and because re.sub matches never overlap, that
    # ideograph could not serve as the left-hand side of the next gap. The
    # result was that "中 文 字" only lost every other space.
    text = re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])', '', text)
    # Remove spaces before and after Chinese punctuation
    text = re.sub(r'\s+([\u3000-\u303f\uff00-\uffef])', r'\1', text)
    text = re.sub(r'([\u3000-\u303f\uff00-\uffef])\s+', r'\1', text)
    return text.strip()


def categorize_highlight(color):
    """Return the category whose reference colour is nearest to ``color``.

    Nearness is the Euclidean distance in RGB space. The number of colour
    components selects the colour space, following the PDF annotation
    convention:

    * ``None`` or empty -- no colour specified; treated as black
    * 1 component -- gray level, expanded to equal R, G and B
    * 3 components -- RGB, used as is
    * 4 components -- CMYK, converted with the simple
      ``channel = (1 - ink) * (1 - k)`` approximation

    Args:
        color: Sequence of floats in the range 0..1, or ``None``.

    Returns:
        str: The category name of the closest reference colour. When two
        reference colours are equally close, the one listed first wins.

    Raises:
        ValueError: If ``color`` has a component count other than 0, 1, 3
            or 4.
    """
    # Customize the highlight categories here; keys are RGB reference colours.
    color_mapping = {
        (0.5608, 0.8706, 0.9765): "Ideas & Insights",  # Light Blue
        (1.0, 0.9412, 0.4): "General Notes",  # Yellow
        (0.4902, 0.9412, 0.4): "Action Items / To-Do",  # Green
        (0.9686, 0.6, 0.8196): "Quotes & References",  # Pink
        (0.9216, 0.2863, 0.2863): "Critical Issues / Warnings"  # Red
    }

    components = [] if color is None else [float(value) for value in color]

    # Bring every supported colour space to RGB first. Comparing a gray or
    # CMYK tuple component-by-component against RGB references would either
    # look at the wrong channels or index past the end of the reference.
    if not components:
        rgb = (0.0, 0.0, 0.0)
    elif len(components) == 1:
        rgb = (components[0],) * 3
    elif len(components) == 3:
        rgb = tuple(components)
    elif len(components) == 4:
        cyan, magenta, yellow, key = components
        rgb = (
            (1.0 - cyan) * (1.0 - key),
            (1.0 - magenta) * (1.0 - key),
            (1.0 - yellow) * (1.0 - key),
        )
    else:
        raise ValueError(
            f"Unsupported colour with {len(components)} components: {color!r}"
        )

    target = np.array(rgb)

    def squared_distance(ref_color):
        # The square root is monotonic, so skipping it does not change
        # which reference colour is the minimum.
        return float(np.sum((target - np.array(ref_color)) ** 2))

    best_match = min(color_mapping, key=squared_distance)

    return color_mapping[best_match]


def clean_text_by_punctuation(text):
    """Truncate text right after its final sentence-ending punctuation mark.

    Recognised terminators are the full stop, question mark and exclamation
    mark in both their ASCII and Chinese/full-width forms. Text containing
    none of them yields an empty string.
    """
    sentence_enders = {'.', '。', '?', '？', '!', '！'}

    # Walk backwards so the first hit is the last terminator in the text.
    for distance_from_end, char in enumerate(reversed(text)):
        if char in sentence_enders:
            # Keep everything up to and including the terminator.
            return text[:len(text) - distance_from_end]

    # No terminator anywhere: nothing is kept.
    return ""


def _collect_highlights(pdf_document):
    """Build the list of highlight records from an already opened document."""
    # NOTE(review): Decomposer is a project helper not visible here. From the
    # way the values are used below, min_y/max_y presumably bound the body
    # text vertically and single_y is roughly one line height -- confirm
    # against utils.decompose.
    min_y, max_y, single_y = Decomposer(pdf_document).run()

    highlights = []
    for page_num in range(pdf_document.page_count):
        page = pdf_document[page_num]

        for annot in page.annots():
            if annot.type[0] != 8:  # 8 is the highlight annotation type
                continue

            # Extract highlighted text; the encode/decode round trip drops
            # any characters that cannot be represented in UTF-8.
            highlight_text = page.get_text("text", clip=annot.rect, sort=True, flags=1).strip()
            highlight_text = highlight_text.encode("utf-8", "ignore").decode("utf-8")
            if not highlight_text:
                continue

            # The "stroke" key can be present with an empty or None value
            # when no colour is set, so dict.get's default would not apply;
            # `or` falls back to black in both cases.
            color_rgb = annot.colors.get("stroke") or [0, 0, 0]
            category = categorize_highlight(color_rgb)

            # Extract popup comment if it exists
            comment = annot.info.get("content", "").strip() if annot.has_popup else ""

            for chunk in highlight_text.split("\n\n"):  # Handle consecutive highlights
                cleaned_chunk = clean_text_by_punctuation(clean_chinese_text(chunk))
                if not cleaned_chunk:  # Skip if no valid text after cleaning
                    continue

                # Stored pages are 1-based while page_num is 0-based, so
                # equality means the previous record sits on the page just
                # before this one. Together with "previous ends near the
                # bottom" and "current starts near the top", this detects a
                # highlight that continues across a page break.
                continues_previous = (
                    highlights and
                    highlights[-1]['page'] == page_num and
                    highlights[-1]['rect'][3] > (max_y - single_y) and
                    annot.rect[1] < (min_y + single_y)
                )
                if continues_previous:
                    # Only the text is extended; page, category, comment and
                    # rect of the earlier record are kept.
                    highlights[-1]['text'] += cleaned_chunk
                else:
                    highlights.append({
                        "page": page_num + 1,
                        "text": cleaned_chunk,
                        "category": category,
                        "comment": comment,
                        "rect": annot.rect
                    })

    return highlights


def extract_highlights(pdf_path):
    """
    Extract all highlights from a PDF file.

    The text under every highlight annotation is split on blank lines; each
    chunk is cleaned with clean_chinese_text and clean_text_by_punctuation,
    and chunks that end up empty are dropped. A chunk that appears to
    continue the previous highlight across a page break is appended to that
    highlight's text instead of starting a new entry.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        list: List of dictionaries with the keys "page" (1-based page
        number), "text", "category", "comment" and "rect" (the annotation
        rectangle). If any error occurs, the error is printed and an empty
        list is returned instead of raising.
    """
    try:
        pdf_document = fitz.open(pdf_path)
        try:
            return _collect_highlights(pdf_document)
        finally:
            # Close the document on failure as well as on success.
            pdf_document.close()
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and return
        # no highlights.
        print(f"Error processing PDF: {str(e)}")
        return []


def main(
    pdf_path="OneDrive-2025-04-15/专题讨论_线下研讨会DeepSeek效应追踪AI产业的持续变革-12Mar2025_zho.pdf",
    output_file="highlights_summary.md",
):
    """Write a Markdown summary of a PDF's highlights, grouped by category.

    Args:
        pdf_path (str): PDF to read. Defaults to the example document.
        output_file (str): Markdown file to write; overwritten if it exists.
    """
    highlights = extract_highlights(pdf_path)

    # Group highlights by category, keeping first-seen category order.
    highlights_by_category = {}
    for highlight in highlights:
        highlights_by_category.setdefault(highlight['category'], []).append(highlight)

    # Collect the pieces and join once instead of growing a string with +=.
    parts = [
        "# PDF Highlights Summary\n\n",
        f"Total highlights found: {len(highlights)}\n\n",
    ]
    for category, category_highlights in highlights_by_category.items():
        parts.append(f"## {category}\n\n")
        for highlight in category_highlights:
            parts.append(f"### Page {highlight['page']}\n\n")
            parts.append(f"{highlight['text']}\n\n")
            if highlight['comment']:
                # Popup comments are rendered as a Markdown block quote.
                parts.append(f"> {highlight['comment']}\n\n")
        parts.append("---\n\n")

    # Save to markdown file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("".join(parts))

    print(f"Highlights summary has been saved to {output_file}")


# Run the example export only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()