Spaces:
Sleeping
Sleeping
File size: 6,225 Bytes
0bad002 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import fitz # PyMuPDF
import numpy as np
import re
from utils.decompose import Decomposer
def clean_chinese_text(text):
    """Remove whitespace between Chinese characters and around CJK punctuation.

    Args:
        text (str): Text to clean.

    Returns:
        str: ``text`` with whitespace removed between adjacent CJK ideographs
        (U+4E00-U+9FFF) and on either side of characters in the ranges
        U+3000-U+303F and U+FF00-U+FFEF, with leading and trailing
        whitespace stripped.
    """
    # Lookarounds leave the neighbouring ideographs unconsumed, so every gap
    # in a run such as "中 文 字" is closed. Capturing both neighbours would
    # consume the middle character and skip every other gap.
    text = re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])', '', text)
    # Remove spaces before and after Chinese punctuation
    text = re.sub(r'\s+([\u3000-\u303f\uff00-\uffef])', r'\1', text)
    text = re.sub(r'([\u3000-\u303f\uff00-\uffef])\s+', r'\1', text)
    return text.strip()
def categorize_highlight(color):
    """Map a highlight colour to the category of the nearest reference colour.

    Args:
        color: Sequence of numeric colour components, compared position by
            position against the RGB reference colours below. ``None`` is
            treated as an empty sequence.

    Returns:
        str: Category name of the reference colour with the smallest
        Euclidean distance to ``color``. When several references are equally
        close (for example when ``color`` is empty), the first one listed in
        the mapping wins.
    """
    # Customize the highlight categories here: reference RGB colour -> name.
    color_mapping = {
        (0.5608, 0.8706, 0.9765): "Ideas & Insights",  # Light Blue
        (1.0, 0.9412, 0.4): "General Notes",  # Yellow
        (0.4902, 0.9412, 0.4): "Action Items / To-Do",  # Green
        (0.9686, 0.6, 0.8196): "Quotes & References",  # Pink
        (0.9216, 0.2863, 0.2863): "Critical Issues / Warnings",  # Red
    }
    components = () if color is None else tuple(color)

    def distance(ref_color):
        # zip() stops at the shorter sequence, so a colour carrying more than
        # three components no longer indexes past the end of the reference.
        # NOTE(review): a four-component PDF colour may be CMYK rather than
        # RGB plus an extra channel; only its first three values are compared
        # here, so confirm and convert upstream if such annotations occur.
        return sum((c - r) ** 2 for c, r in zip(components, ref_color)) ** 0.5

    # Iterating a dict yields its keys, i.e. the reference colours.
    return color_mapping[min(color_mapping, key=distance)]
def clean_text_by_punctuation(text):
    """Trim text so that it ends at its last sentence-ending punctuation mark.

    Args:
        text (str): Text to trim.

    Returns:
        str: ``text`` up to and including the last occurrence of one of
        ``.``, ``?``, ``!`` or their Chinese counterparts (ideographic full
        stop, full-width question mark, full-width exclamation mark). An
        empty string is returned when none of them occurs.
    """
    # The CJK marks are written as escapes so the full-width forms cannot be
    # silently folded into their ASCII look-alikes by an editor or copy/paste.
    ending_punctuation = (
        ".", "?", "!",
        "\u3002",  # ideographic full stop
        "\uff1f",  # full-width question mark
        "\uff01",  # full-width exclamation mark
    )
    # rfind() returns -1 for a missing mark, so the maximum is -1 only when
    # no ending mark is present at all; the slice below is then empty.
    last_punct_index = max(text.rfind(mark) for mark in ending_punctuation)
    return text[:last_punct_index + 1]
def extract_highlights(pdf_path):
    """
    Extract all highlights from a PDF file.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        list: List of dictionaries containing highlight information. Each
        dictionary has the keys ``page`` (1-based page number), ``text``,
        ``category``, ``comment`` and ``rect`` (the annotation rectangle).
        If any error occurs it is printed and an empty list is returned.
    """
    try:
        # The context manager closes the document even when extraction
        # fails part-way through.
        with fitz.open(pdf_path) as pdf_document:
            # NOTE(review): presumably the top and bottom y-limits of the
            # page body and the height of one text line; confirm against
            # utils.decompose.Decomposer.
            min_y, max_y, single_y = Decomposer(pdf_document).run()
            highlights = []
            for page_num in range(pdf_document.page_count):
                page = pdf_document[page_num]
                for annot in page.annots():
                    if annot.type[0] != 8:  # Only highlight annotations
                        continue

                    # Extract highlighted text
                    highlight_text = page.get_text(
                        "text", clip=annot.rect, sort=True, flags=1
                    ).strip()
                    # Drop anything that cannot be encoded as UTF-8.
                    highlight_text = highlight_text.encode("utf-8", "ignore").decode("utf-8")

                    # Extract annotation color
                    color_rgb = annot.colors.get("stroke", [0, 0, 0])  # Default black if undefined
                    category = categorize_highlight(color_rgb)

                    # Extract popup comment if it exists
                    comment = annot.info.get("content", "").strip() if annot.has_popup else ""

                    if not highlight_text:
                        continue

                    for chunk in highlight_text.split("\n\n"):  # Handle consecutive highlights
                        cleaned_chunk = clean_text_by_punctuation(clean_chinese_text(chunk))
                        if not cleaned_chunk:  # Skip if no valid text after cleaning
                            continue

                        previous = highlights[-1] if highlights else None
                        # Stored page numbers are 1-based while page_num is
                        # 0-based, so equality means the previous highlight
                        # sits on the page just before this one. Together
                        # with "previous ends within single_y of max_y" and
                        # "this one starts within single_y of min_y", that
                        # marks a highlight running over a page break.
                        continues_previous = (
                            previous is not None
                            and previous['page'] == page_num
                            and previous['rect'][3] > (max_y - single_y)
                            and annot.rect[1] < (min_y + single_y)
                        )
                        if continues_previous:
                            previous['text'] += cleaned_chunk
                        else:
                            # Store structured highlight data
                            highlights.append({
                                "page": page_num + 1,
                                "text": cleaned_chunk,
                                "category": category,
                                "comment": comment,
                                "rect": annot.rect,
                            })
            return highlights
    except Exception as e:
        # Best-effort boundary: report the problem and hand back no results.
        print(f"Error processing PDF: {str(e)}")
        return []
def main(
    pdf_path="OneDrive-2025-04-15/专题讨论_线下研讨会DeepSeek效应追踪AI产业的持续变革-12Mar2025_zho.pdf",
    output_file="highlights_summary.md",
):
    """Extract the highlights of a PDF and write them to a markdown summary.

    Args:
        pdf_path (str): Path of the PDF to read. Defaults to the example
            document used by the original script.
        output_file (str): Path of the markdown file to write (overwritten
            if it exists).

    The summary starts with a title and the total count, followed by one
    ``##`` section per category in order of first appearance. Each highlight
    contributes its page heading, its text, its comment as a block quote
    when non-empty, and a ``---`` separator.
    """
    highlights = extract_highlights(pdf_path)

    # Group highlights by category
    highlights_by_category = {}
    for highlight in highlights:
        highlights_by_category.setdefault(highlight['category'], []).append(highlight)

    # Collect the fragments and join once rather than growing a string.
    parts = [
        "# PDF Highlights Summary\n\n",
        f"Total highlights found: {len(highlights)}\n\n",
    ]
    for category, category_highlights in highlights_by_category.items():
        parts.append(f"## {category}\n\n")
        for highlight in category_highlights:
            parts.append(f"### Page {highlight['page']}\n\n")
            parts.append(f"{highlight['text']}\n\n")
            if highlight['comment']:
                parts.append(f"> {highlight['comment']}\n\n")
            parts.append("---\n\n")

    # Save to markdown file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    print(f"Highlights summary has been saved to {output_file}")
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|