"""Extract highlight annotations from a PDF and summarise them as Markdown."""

import re
from collections import defaultdict

import fitz  # PyMuPDF
import numpy as np

from utils.decompose import Decomposer

# Whitespace sitting between two CJK ideographs.  Look-arounds are used so the
# neighbouring characters are not consumed; otherwise "中 文 字" would only be
# collapsed to "中文 字" because "文" could not anchor the next match.
_SPACE_BETWEEN_CJK = re.compile(r'(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])')
_SPACE_BEFORE_CJK_PUNCT = re.compile(r'\s+([\u3000-\u303f\uff00-\uffef])')
_SPACE_AFTER_CJK_PUNCT = re.compile(r'([\u3000-\u303f\uff00-\uffef])\s+')

# Sentence-ending marks, half-width and full-width.
_ENDING_PUNCTUATION = frozenset(".。??!!")

# Customize the categories of highlights as you like: each reference RGB
# colour (components in 0..1) maps to the category name reported for it.
_COLOR_CATEGORIES = {
    (0.5608, 0.8706, 0.9765): "Ideas & Insights",  # Light Blue
    (1.0, 0.9412, 0.4): "General Notes",  # Yellow
    (0.4902, 0.9412, 0.4): "Action Items / To-Do",  # Green
    (0.9686, 0.6, 0.8196): "Quotes & References",  # Pink
    (0.9216, 0.2863, 0.2863): "Critical Issues / Warnings",  # Red
}
_REFERENCE_COLORS = np.array(list(_COLOR_CATEGORIES), dtype=float)
_CATEGORY_NAMES = list(_COLOR_CATEGORIES.values())

# Colour assumed when an annotation carries no stroke colour (black).
_DEFAULT_COLOR = (0.0, 0.0, 0.0)

# First element of ``annot.type`` for a highlight annotation.
_HIGHLIGHT_ANNOT_TYPE = 8

_DEFAULT_PDF_PATH = "OneDrive-2025-04-15/专题讨论_线下研讨会DeepSeek效应追踪AI产业的持续变革-12Mar2025_zho.pdf"
_DEFAULT_OUTPUT_FILE = "highlights_summary.md"


def clean_chinese_text(text):
    """Remove stray whitespace inside Chinese text.

    Whitespace between two CJK ideographs, and whitespace directly before or
    after CJK / full-width punctuation, is dropped.  Leading and trailing
    whitespace is stripped from the result.

    Args:
        text (str): Raw text, typically as extracted from a PDF.

    Returns:
        str: The cleaned text.
    """
    text = _SPACE_BETWEEN_CJK.sub('', text)
    text = _SPACE_BEFORE_CJK_PUNCT.sub(r'\1', text)
    text = _SPACE_AFTER_CJK_PUNCT.sub(r'\1', text)
    return text.strip()


def _to_rgb(color):
    """Normalise an annotation colour to a 3-component RGB array.

    The number of components selects the colour space: none means "no colour"
    (treated as black), 1 is gray, 3 is RGB and 4 is CMYK.

    Raises:
        ValueError: If the colour has any other number of components.
    """
    components = np.asarray([] if color is None else color, dtype=float).ravel()
    if components.size == 0:
        return np.array(_DEFAULT_COLOR)
    if components.size == 1:
        return np.repeat(components, 3)
    if components.size == 3:
        return components
    if components.size == 4:
        cmy, key = components[:3], components[3]
        return (1.0 - cmy) * (1.0 - key)
    raise ValueError(
        f"Unsupported colour with {components.size} components: {color!r}"
    )


def categorize_highlight(color):
    """Return the category whose reference colour is closest to ``color``.

    Closeness is the Euclidean distance in RGB space; on a tie the category
    listed first in the mapping wins.

    Args:
        color: Sequence of floats in 0..1 (gray, RGB or CMYK).  ``None`` or an
            empty sequence is treated as black.

    Returns:
        str: The matching category name.

    Raises:
        ValueError: If ``color`` has an unsupported number of components.
    """
    distances = np.linalg.norm(_REFERENCE_COLORS - _to_rgb(color), axis=1)
    return _CATEGORY_NAMES[int(np.argmin(distances))]


def clean_text_by_punctuation(text):
    """Drop everything after the last sentence-ending punctuation mark.

    Both half-width (``. ? !``) and full-width (``。 ? !``) marks count.

    Args:
        text (str): Text to trim.

    Returns:
        str: ``text`` up to and including its last ending mark, or an empty
        string when it contains none.
    """
    last_index = max((text.rfind(mark) for mark in _ENDING_PUNCTUATION), default=-1)
    # rfind() yields -1 when nothing matched, which makes the slice empty.
    return text[:last_index + 1]


def _collect_highlights(pdf_document):
    """Walk every page of an open document and gather its highlight records."""
    # NOTE(review): presumably the top and bottom of the text area plus the
    # height of one line -- confirm against Decomposer.run().
    min_y, max_y, single_y = Decomposer(pdf_document).run()

    highlights = []
    for page_num in range(pdf_document.page_count):
        page = pdf_document[page_num]
        for annot in page.annots():
            if annot.type[0] != _HIGHLIGHT_ANNOT_TYPE:
                continue

            # flags=1 is kept from the original call
            # (presumably TEXT_PRESERVE_LIGATURES -- TODO confirm).
            highlight_text = page.get_text(
                "text", clip=annot.rect, sort=True, flags=1
            ).strip()
            # Drop anything that cannot be encoded as UTF-8 (e.g. lone surrogates).
            highlight_text = highlight_text.encode("utf-8", "ignore").decode("utf-8")
            if not highlight_text:
                continue

            # A missing or empty stroke colour is categorised as black.
            category = categorize_highlight(annot.colors.get("stroke"))
            comment = annot.info.get("content", "").strip() if annot.has_popup else ""

            # A blank line separates consecutive highlights caught by one clip.
            for chunk in highlight_text.split("\n\n"):
                cleaned_chunk = clean_text_by_punctuation(clean_chinese_text(chunk))
                if not cleaned_chunk:
                    continue

                previous = highlights[-1] if highlights else None
                # Stored page numbers are 1-based, so "== page_num" means the
                # previous record lives on the page just before this one.  If
                # it ended in the last line of that page and this annotation
                # starts in the first line of the current page, the highlight
                # runs across the page break and is merged.
                continues_previous = (
                    previous is not None
                    and previous["page"] == page_num
                    and previous["rect"][3] > (max_y - single_y)
                    and annot.rect[1] < (min_y + single_y)
                )
                if continues_previous:
                    previous["text"] += cleaned_chunk
                else:
                    highlights.append({
                        "page": page_num + 1,
                        "text": cleaned_chunk,
                        "category": category,
                        "comment": comment,
                        "rect": annot.rect,
                    })
    return highlights


def extract_highlights(pdf_path):
    """Extract all highlights from a PDF file.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list: One dictionary per highlight with the keys ``page`` (1-based),
        ``text``, ``category``, ``comment`` and ``rect``.  An empty list is
        returned, after printing the error, if anything goes wrong.
    """
    try:
        # The context manager closes the document even when extraction fails.
        with fitz.open(pdf_path) as pdf_document:
            return _collect_highlights(pdf_document)
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and return nothing.
        print(f"Error processing PDF: {e}")
        return []


def _render_markdown(highlights):
    """Render highlight records as a Markdown summary grouped by category."""
    by_category = defaultdict(list)
    for highlight in highlights:
        by_category[highlight['category']].append(highlight)

    parts = [
        "# PDF Highlights Summary\n\n",
        f"Total highlights found: {len(highlights)}\n\n",
    ]
    # Categories appear in the order they were first encountered.
    for category, category_highlights in by_category.items():
        parts.append(f"## {category}\n\n")
        for highlight in category_highlights:
            parts.append(f"### Page {highlight['page']}\n\n")
            parts.append(f"{highlight['text']}\n\n")
            if highlight['comment']:
                parts.append(f"> {highlight['comment']}\n\n")
            parts.append("---\n\n")
    return "".join(parts)


def main(pdf_path=_DEFAULT_PDF_PATH, output_file=_DEFAULT_OUTPUT_FILE):
    """Extract highlights from ``pdf_path`` and write a Markdown summary.

    Args:
        pdf_path (str): PDF to read; defaults to the bundled example path.
        output_file (str): Markdown file to write (overwritten if present).
    """
    highlights = extract_highlights(pdf_path)
    markdown_content = _render_markdown(highlights)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(markdown_content)

    print(f"Highlights summary has been saved to {output_file}")


if __name__ == "__main__":
    main()