File size: 6,225 Bytes
0bad002
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import fitz  # PyMuPDF
import numpy as np
import re
from utils.decompose import Decomposer

def clean_chinese_text(text):
    """Remove the stray whitespace that text extraction leaves in Chinese text.

    Three substitutions are applied in order, then the result is stripped:

    1. whitespace between two CJK ideographs (U+4E00-U+9FFF) is removed;
    2. whitespace before a character in U+3000-U+303F or U+FF00-U+FFEF
       (CJK punctuation and fullwidth forms) is removed;
    3. whitespace after such a character is removed.

    Args:
        text (str): Text to clean.

    Returns:
        str: The cleaned text without leading or trailing whitespace.
    """
    # Lookarounds instead of capture groups: a consuming pattern swallows the
    # right-hand ideograph, so that character can no longer act as the
    # left-hand side of the next gap and "中 文 字" would only become "中文 字".
    text = re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])', '', text)
    # Remove spaces before and after Chinese punctuation
    text = re.sub(r'\s+([\u3000-\u303f\uff00-\uffef])', r'\1', text)
    text = re.sub(r'([\u3000-\u303f\uff00-\uffef])\s+', r'\1', text)
    return text.strip()


def categorize_highlight(color):
    """Return the category whose reference colour is closest to *color*.

    Closeness is the Euclidean distance in RGB space. The squared distance is
    compared, which selects the same nearest colour without the square root.

    Args:
        color: Sequence of colour components in the range 0..1, as found in a
            PDF annotation colour array. Three components are used as RGB.
            One component is treated as a gray level and four components as
            CMYK; both are converted to RGB before matching. ``None`` is
            handled like an empty sequence.

    Returns:
        str: The category label of the nearest reference colour. An empty
        colour is equally distant from every reference, so the first entry of
        the mapping is returned.
    """
    # customize the categories of highlights as you like
    color_mapping = {
        (0.5608, 0.8706, 0.9765): "Ideas & Insights",  # Light Blue
        (1.0, 0.9412, 0.4): "General Notes",  # Yellow
        (0.4902, 0.9412, 0.4): "Action Items / To-Do",  # Green
        (0.9686, 0.6, 0.8196): "Quotes & References",  # Pink
        (0.9216, 0.2863, 0.2863): "Critical Issues / Warnings"  # Red
    }

    channels = () if color is None else tuple(float(c) for c in color)

    # Normalise to RGB so the comparison below never indexes past the three
    # channels of a reference colour (a 4-component colour used to raise
    # IndexError, a 1-component colour was compared on the red channel only).
    if len(channels) == 1:
        rgb = channels * 3
    elif len(channels) == 4:
        cyan, magenta, yellow, black = channels
        rgb = (
            (1.0 - cyan) * (1.0 - black),
            (1.0 - magenta) * (1.0 - black),
            (1.0 - yellow) * (1.0 - black),
        )
    else:
        rgb = channels

    def squared_distance(ref_color):
        return sum((a - b) ** 2 for a, b in zip(rgb, ref_color))

    best_match = min(color_mapping, key=squared_distance)
    return color_mapping[best_match]


def clean_text_by_punctuation(text):
    """Cut *text* off after its last sentence-ending punctuation mark.

    Recognised endings are the ASCII ``.``, ``?`` and ``!`` and their Chinese
    counterparts: the ideographic full stop and the fullwidth question and
    exclamation marks.

    Args:
        text (str): Text to truncate.

    Returns:
        str: The text up to and including the last ending mark, or an empty
        string when the text contains no ending mark at all.
    """
    ending_punctuation = {
        '.', '?', '!',             # English
        '。', '\uff1f', '\uff01',  # Chinese: full stop, fullwidth ? and !
    }

    # Walk backwards so the scan stops at the first hit from the end.
    for index in range(len(text) - 1, -1, -1):
        if text[index] in ending_punctuation:
            return text[:index + 1]

    # No proper ending punctuation found
    return ""


def extract_highlights(pdf_path):
    """
    Extract all highlights from a PDF file.

    For every highlight annotation the text inside the annotation rectangle
    is read, split on blank lines, cleaned with ``clean_chinese_text`` and
    ``clean_text_by_punctuation``, and categorised by the annotation's stroke
    colour. A chunk that appears to continue a highlight from the bottom of
    the preceding page is appended to that earlier entry instead of starting
    a new one.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        list: List of dictionaries containing highlight information, with the
        keys ``"page"`` (1-based), ``"text"``, ``"category"``, ``"comment"``
        and ``"rect"``. If anything fails, the error is printed and an empty
        list is returned.
    """
    try:
        # The context manager closes the document even if processing raises.
        with fitz.open(pdf_path) as pdf_document:
            # NOTE(review): presumably the top and bottom y of the page's text
            # area and the height of a single line - confirm against
            # utils.decompose.Decomposer.
            min_y, max_y, single_y = Decomposer(pdf_document).run()

            highlights = []
            for page_num in range(pdf_document.page_count):
                page = pdf_document[page_num]

                for annot in page.annots():
                    if annot.type[0] != 8:  # 8 == highlight annotation
                        continue

                    # Extract highlighted text
                    highlight_text = page.get_text("text", clip=annot.rect, sort=True, flags=1).strip()
                    # Drop anything that cannot be encoded as UTF-8.
                    highlight_text = highlight_text.encode("utf-8", "ignore").decode("utf-8")
                    if not highlight_text:
                        continue

                    # Default to black when no stroke colour is set. `or` is
                    # needed because a key that is present but empty/None
                    # would bypass the default argument of dict.get().
                    color_rgb = annot.colors.get("stroke") or [0, 0, 0]
                    category = categorize_highlight(color_rgb)

                    # Extract popup comment if it exists
                    comment = annot.info.get("content", "").strip() if annot.has_popup else ""

                    for chunk in highlight_text.split("\n\n"):  # Handle consecutive highlights
                        cleaned_chunk = clean_text_by_punctuation(clean_chinese_text(chunk))
                        if not cleaned_chunk:  # Skip if no valid text after cleaning
                            continue

                        # Stored page numbers are 1-based while page_num is
                        # 0-based, so equality means the previous entry sits
                        # on the page directly before this one.
                        continues_previous_page = (
                            highlights and
                            highlights[-1]['page'] == page_num and
                            highlights[-1]['rect'][3] > (max_y - single_y) and
                            annot.rect[1] < (min_y + single_y)
                        )
                        if continues_previous_page:  # Handle highlights over page
                            highlights[-1]['text'] += cleaned_chunk
                        else:
                            highlights.append({
                                "page": page_num + 1,
                                "text": cleaned_chunk,
                                "category": category,
                                "comment": comment,
                                "rect": annot.rect
                            })

            return highlights
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        return []


def _build_markdown(highlights):
    """Render highlight dictionaries as a Markdown summary grouped by category."""
    # Group highlights by category, keeping first-seen category order
    highlights_by_category = {}
    for highlight in highlights:
        highlights_by_category.setdefault(highlight['category'], []).append(highlight)

    # Collect the pieces and join once instead of growing a string with +=.
    parts = [
        "# PDF Highlights Summary\n\n",
        f"Total highlights found: {len(highlights)}\n\n",
    ]
    for category, category_highlights in highlights_by_category.items():
        parts.append(f"## {category}\n\n")
        for highlight in category_highlights:
            parts.append(f"### Page {highlight['page']}\n\n")
            parts.append(f"{highlight['text']}\n\n")
            if highlight['comment']:
                parts.append(f"> {highlight['comment']}\n\n")
        parts.append("---\n\n")
    return "".join(parts)


def main(
    pdf_path="OneDrive-2025-04-15/专题讨论_线下研讨会DeepSeek效应追踪AI产业的持续变革-12Mar2025_zho.pdf",
    output_file="highlights_summary.md",
):
    """Extract the highlights of a PDF and write them to a Markdown file.

    Args:
        pdf_path (str): PDF to read. Defaults to the example document.
        output_file (str): Markdown file to write, encoded as UTF-8.
    """
    highlights = extract_highlights(pdf_path)

    # Save to markdown file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(_build_markdown(highlights))

    print(f"Highlights summary has been saved to {output_file}")


if __name__ == "__main__":
    main()