Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| import numpy as np | |
| import re | |
| from utils.decompose import Decomposer | |
def clean_chinese_text(text):
    """Remove stray whitespace inside Chinese text.

    Whitespace is dropped in two situations:

    * between two adjacent CJK ideographs (U+4E00-U+9FFF);
    * directly before or after a character in the CJK punctuation
      (U+3000-U+303F) or fullwidth-forms (U+FF00-U+FFEF) ranges.

    Args:
        text (str): Text to clean.

    Returns:
        str: The cleaned text with leading and trailing whitespace stripped.
    """
    # Lookbehind/lookahead keep the ideographs out of the match itself.
    # Consuming them (as a plain two-group pattern would) makes a run such
    # as "A B C" lose only every other gap, because the character closing
    # one match can no longer open the next one.
    text = re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])', '', text)
    # Remove spaces before and after Chinese punctuation
    text = re.sub(r'\s+([\u3000-\u303f\uff00-\uffef])', r'\1', text)
    text = re.sub(r'([\u3000-\u303f\uff00-\uffef])\s+', r'\1', text)
    return text.strip()
def categorize_highlight(color):
    """Return the category label whose reference color is nearest to ``color``.

    Nearness is the Euclidean distance taken over the components of
    ``color``. When several reference colors are equally near, the one
    listed first in the table wins.
    """
    # Reference swatches and their labels; edit this table to customize
    # the highlight categories.
    reference_labels = {
        (0.5608, 0.8706, 0.9765): "Ideas & Insights",  # Light Blue
        (1.0, 0.9412, 0.4): "General Notes",  # Yellow
        (0.4902, 0.9412, 0.4): "Action Items / To-Do",  # Green
        (0.9686, 0.6, 0.8196): "Quotes & References",  # Pink
        (0.9216, 0.2863, 0.2863): "Critical Issues / Warnings",  # Red
    }

    components = np.array(color)

    def distance_to(reference):
        # Euclidean distance between the input color and one reference swatch.
        squared_total = sum(
            (components[axis] - reference[axis]) ** 2
            for axis in range(len(components))
        )
        return squared_total ** 0.5

    # Linear scan keeping the first strictly-smallest distance seen.
    nearest_reference = None
    nearest_distance = None
    for reference in reference_labels:
        candidate_distance = distance_to(reference)
        if nearest_reference is None or candidate_distance < nearest_distance:
            nearest_reference = reference
            nearest_distance = candidate_distance

    return reference_labels[nearest_reference]
def clean_text_by_punctuation(text):
    """Trim ``text`` so that it ends at its last sentence-ending punctuation mark.

    Recognized marks are the ASCII ``.``, ``?`` and ``!`` and their Chinese
    counterparts (ideographic full stop, fullwidth question mark and
    fullwidth exclamation mark).

    Args:
        text (str): Text to trim.

    Returns:
        str: ``text`` up to and including the last recognized mark, or an
        empty string when it contains none of them.
    """
    # The Chinese marks are written as escapes so they cannot be silently
    # folded into their ASCII look-alikes by an editor or copy/paste.
    ending_punctuation = (
        ".",
        "?",
        "!",
        "\u3002",  # ideographic full stop
        "\uff1f",  # fullwidth question mark
        "\uff01",  # fullwidth exclamation mark
    )

    # rfind yields -1 for a mark that is absent, so the maximum is -1 only
    # when no ending punctuation occurs anywhere in the text.
    last_punct_index = max(text.rfind(mark) for mark in ending_punctuation)
    if last_punct_index == -1:
        return ""

    # Keep everything up to and including the last punctuation mark.
    return text[:last_punct_index + 1]
def extract_highlights(pdf_path):
    """
    Extract all highlights from a PDF file.

    Each highlight annotation's text is read from the page, split on blank
    lines, cleaned with ``clean_chinese_text`` and
    ``clean_text_by_punctuation``, and categorized by its stroke color.
    A highlight at the top of a page is appended to the last highlight of
    the previous page when that one sits at the bottom of its page.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        list: List of dictionaries containing highlight information, with
        the keys ``page`` (1-based page number), ``text``, ``category``,
        ``comment`` and ``rect`` (the annotation rectangle). An empty list
        is returned, after printing the error, if processing fails.
    """
    try:
        # The context manager closes the document on every exit path,
        # including when an exception is raised part-way through.
        with fitz.open(pdf_path) as pdf_document:
            # NOTE(review): presumably the top/bottom y-bounds of the text
            # area and the height of a single line -- confirm against
            # utils.decompose.Decomposer.
            min_y, max_y, single_y = Decomposer(pdf_document).run()
            highlights = []

            for page_num in range(pdf_document.page_count):
                page = pdf_document[page_num]
                for annot in page.annots():
                    if annot.type[0] != 8:  # Only highlight annotations
                        continue

                    # Extract highlighted text
                    highlight_text = page.get_text("text", clip=annot.rect, sort=True, flags=1).strip()
                    # Drop any characters that cannot be encoded as UTF-8
                    highlight_text = highlight_text.encode("utf-8", "ignore").decode("utf-8")
                    if not highlight_text:
                        continue

                    # Extract annotation color. ``or`` (rather than a .get
                    # default) also covers a "stroke" entry that is present
                    # but None or empty, so black really is the fallback.
                    color_rgb = annot.colors.get("stroke") or [0, 0, 0]
                    category = categorize_highlight(color_rgb)

                    # Extract popup comment if it exists
                    comment = annot.info.get("content", "").strip() if annot.has_popup else ""

                    for chunk in highlight_text.split("\n\n"):  # Handle consecutive highlights
                        cleaned_chunk = clean_text_by_punctuation(clean_chinese_text(chunk))
                        if not cleaned_chunk:  # Skip if no valid text after cleaning
                            continue

                        # Stored page numbers are 1-based while page_num is
                        # 0-based, so equality means the last stored
                        # highlight is on the *previous* page.
                        continues_previous_page = (
                            bool(highlights)
                            and highlights[-1]['page'] == page_num
                            and highlights[-1]['rect'][3] > (max_y - single_y)
                            and annot.rect[1] < (min_y + single_y)
                        )

                        if continues_previous_page:  # Handle highlights over page
                            highlights[-1]['text'] += cleaned_chunk
                        else:
                            # Store structured highlight data
                            highlights.append({
                                "page": page_num + 1,
                                "text": cleaned_chunk,
                                "category": category,
                                "comment": comment,
                                "rect": annot.rect,
                            })

            return highlights
    except Exception as e:
        # Best-effort API: report the problem and hand back an empty result.
        print(f"Error processing PDF: {str(e)}")
        return []
def main(
    pdf_path="OneDrive-2025-04-15/专题讨论_线下研讨会DeepSeek效应追踪AI产业的持续变革-12Mar2025_zho.pdf",
    output_file="highlights_summary.md",
):
    """Write a Markdown summary of a PDF's highlights, grouped by category.

    Args:
        pdf_path (str): Path of the PDF to read. Defaults to the example
            document.
        output_file (str): Path of the Markdown file to write.
    """
    highlights = extract_highlights(pdf_path)

    # Group highlights by category, keeping first-seen category order
    highlights_by_category = {}
    for highlight in highlights:
        highlights_by_category.setdefault(highlight['category'], []).append(highlight)

    # Collect the pieces in a list and join once instead of repeatedly
    # concatenating onto a growing string.
    parts = [
        "# PDF Highlights Summary\n\n",
        f"Total highlights found: {len(highlights)}\n\n",
    ]

    # Add highlights grouped by category
    for category, category_highlights in highlights_by_category.items():
        parts.append(f"## {category}\n\n")
        for highlight in category_highlights:
            parts.append(f"### Page {highlight['page']}\n\n")
            parts.append(f"{highlight['text']}\n\n")
            if highlight['comment']:
                parts.append(f"> {highlight['comment']}\n\n")
            parts.append("---\n\n")

    # Save to markdown file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("".join(parts))

    print(f"Highlights summary has been saved to {output_file}")


if __name__ == "__main__":
    main()