# fincatch-ocr / src / PDF_highlight_extractor.py
# gnlui's picture
# initial
# 0bad002
import fitz # PyMuPDF
import numpy as np
import re
from utils.decompose import Decomposer
def clean_chinese_text(text):
    """Clean up text by removing spaces around Chinese characters and punctuation.

    Whitespace is removed when it sits between two Chinese characters
    (U+4E00-U+9FFF) or directly before/after a character in the ranges
    U+3000-U+303F and U+FF00-U+FFEF. The result is also stripped of leading
    and trailing whitespace.

    Args:
        text (str): Text to clean.

    Returns:
        str: The cleaned text.
    """
    # Lookarounds leave the neighbouring characters unconsumed, so every gap
    # in a run of spaced-out characters is closed. Capturing both characters
    # instead would skip every second gap, because the right-hand character of
    # one match could not serve as the left-hand character of the next.
    text = re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])', '', text)
    # Remove spaces before and after Chinese punctuation
    text = re.sub(r'\s+([\u3000-\u303f\uff00-\uffef])', r'\1', text)
    text = re.sub(r'([\u3000-\u303f\uff00-\uffef])\s+', r'\1', text)
    return text.strip()
def categorize_highlight(color):
    """Return the category whose reference color lies nearest to ``color``.

    Nearness is the Euclidean distance between ``color`` and each reference
    color, computed over as many components as ``color`` supplies.
    """
    # Reference colors (RGB components in the 0-1 range) and the category each
    # one stands for; edit this table to customize the categories.
    color_mapping = {
        (0.5608, 0.8706, 0.9765): "Ideas & Insights",  # Light Blue
        (1.0, 0.9412, 0.4): "General Notes",  # Yellow
        (0.4902, 0.9412, 0.4): "Action Items / To-Do",  # Green
        (0.9686, 0.6, 0.8196): "Quotes & References",  # Pink
        (0.9216, 0.2863, 0.2863): "Critical Issues / Warnings"  # Red
    }

    # Work on a NumPy array so lists, tuples and arrays are handled alike.
    components = np.array(color)

    def distance_to(reference):
        # Euclidean distance from the input color to one reference color.
        squared_error = 0
        for index, component in enumerate(components):
            squared_error += (component - reference[index]) ** 2
        return squared_error ** 0.5

    # min() keeps the first candidate on ties, i.e. the earliest table entry.
    nearest = min(color_mapping, key=distance_to)
    return color_mapping[nearest]
def clean_text_by_punctuation(text):
    """Clean text by removing content after the last proper punctuation mark.

    Args:
        text (str): Text to trim.

    Returns:
        str: ``text`` up to and including its last sentence-ending mark, or
            an empty string when it contains none.
    """
    # Sentence-ending marks in both ASCII and Chinese full-width form. The
    # full-width ones are written as escapes so they cannot be mistaken for,
    # or silently normalized into, their ASCII look-alikes.
    ending_punctuation = (
        '.', '?', '!',
        '\u3002',  # ideographic full stop
        '\uff1f',  # full-width question mark
        '\uff01',  # full-width exclamation mark
    )
    # str.rfind returns -1 for a missing mark, so the maximum is -1 exactly
    # when no ending punctuation occurs anywhere in the text.
    last_punct_index = max(text.rfind(mark) for mark in ending_punctuation)
    # If no proper ending punctuation found, return empty string
    if last_punct_index == -1:
        return ""
    # Return text up to and including the last punctuation mark
    return text[:last_punct_index + 1]
def extract_highlights(pdf_path):
    """
    Extract all highlights from a PDF file.

    The text under each highlight annotation is split on blank lines and each
    piece is cleaned with clean_chinese_text and clean_text_by_punctuation;
    pieces that end up empty are skipped. A highlight near the top of a page
    is appended to the previous entry when that entry lies near the bottom of
    the preceding page, so a passage spanning a page break stays one item.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        list: List of dictionaries containing highlight information, with the
            keys "page" (1-based page number), "text", "category", "comment"
            and "rect" (the annotation rectangle). An empty list is returned
            when any error occurs while processing the file.
    """
    highlights = []
    try:
        # Opening the document in a ``with`` block closes it even when an
        # error interrupts processing.
        with fitz.open(pdf_path) as pdf_document:
            # NOTE(review): presumably the top/bottom y-limits of the page
            # body and the height of a single text line -- confirm against
            # utils.decompose.Decomposer.
            min_y, max_y, single_y = Decomposer(pdf_document).run()

            for page_num in range(pdf_document.page_count):
                page = pdf_document[page_num]
                for annot in page.annots():
                    if annot.type[0] != 8:  # Only highlight annotations
                        continue

                    # Extract highlighted text
                    highlight_text = page.get_text(
                        "text", clip=annot.rect, sort=True, flags=1
                    ).strip()
                    # Drop anything that cannot be encoded as UTF-8
                    highlight_text = highlight_text.encode("utf-8", "ignore").decode("utf-8")

                    # Extract annotation color. ``or`` also covers a "stroke"
                    # entry that is present but None or empty, which a
                    # dict.get() default would let through.
                    color_rgb = annot.colors.get("stroke") or [0, 0, 0]  # Default black if undefined
                    category = categorize_highlight(color_rgb)

                    # Extract popup comment if it exists
                    comment = annot.info.get("content", "").strip() if annot.has_popup else ""

                    if not highlight_text:
                        continue

                    for chunk in highlight_text.split("\n\n"):  # Handle consecutive highlights
                        cleaned_chunk = clean_text_by_punctuation(clean_chinese_text(chunk))
                        if not cleaned_chunk:  # Skip if no valid text after cleaning
                            continue

                        # Handle highlights over page: "page" is stored
                        # 1-based, so equality with the 0-based page_num means
                        # the previous entry sits on the preceding page.
                        continues_previous = (
                            highlights and
                            highlights[-1]['page'] == page_num and
                            highlights[-1]['rect'][3] > (max_y - single_y) and
                            annot.rect[1] < (min_y + single_y)
                        )
                        if continues_previous:
                            highlights[-1]['text'] += cleaned_chunk
                        else:
                            # Store structured highlight data
                            highlights.append({
                                "page": page_num + 1,
                                "text": cleaned_chunk,
                                "category": category,
                                "comment": comment,
                                "rect": annot.rect
                            })
        return highlights
    except Exception as e:
        # Best-effort API: report the problem and hand back an empty result.
        print(f"Error processing PDF: {str(e)}")
        return []
def _build_markdown(highlights):
    """Render extracted highlights as a Markdown summary grouped by category."""
    # Group highlights by category, keeping categories in first-seen order
    highlights_by_category = {}
    for highlight in highlights:
        highlights_by_category.setdefault(highlight['category'], []).append(highlight)

    # Collect the pieces and join once instead of repeatedly extending a string
    parts = [
        "# PDF Highlights Summary\n\n",
        f"Total highlights found: {len(highlights)}\n\n",
    ]
    # Add highlights grouped by category
    for category, category_highlights in highlights_by_category.items():
        parts.append(f"## {category}\n\n")
        for highlight in category_highlights:
            parts.append(f"### Page {highlight['page']}\n\n")
            parts.append(f"{highlight['text']}\n\n")
            if highlight['comment']:
                parts.append(f"> {highlight['comment']}\n\n")
            parts.append("---\n\n")
    return "".join(parts)


def main(
    pdf_path="OneDrive-2025-04-15/专题讨论_线下研讨会DeepSeek效应追踪AI产业的持续变革-12Mar2025_zho.pdf",
    output_file="highlights_summary.md",
):
    """Extract the highlights of a PDF and save them as a Markdown summary.

    Args:
        pdf_path (str): PDF to read. Defaults to the example document.
        output_file (str): Markdown file to write. Defaults to
            "highlights_summary.md" in the current working directory.
    """
    highlights = extract_highlights(pdf_path)

    # Create markdown content
    markdown_content = _build_markdown(highlights)

    # Save to markdown file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(markdown_content)
    print(f"Highlights summary has been saved to {output_file}")
# Run the example extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()