Spaces:

gnlui
/

fincatch-ocr

Sleeping

App Files Files Community

fincatch-ocr / src /PDF_highlight_extractor.py

gnlui

initial

0bad002 7 months ago

raw

history blame contribute delete

6.23 kB

	import fitz # PyMuPDF
	import numpy as np
	import re
	from utils.decompose import Decomposer

	def clean_chinese_text(text):
	    """Remove stray whitespace inside Chinese text.

	    Three passes are applied, then the result is stripped:

	    1. Whitespace sitting between two CJK ideographs (U+4E00-U+9FFF) is removed.
	    2. Whitespace directly before CJK / full-width punctuation is removed.
	    3. Whitespace directly after CJK / full-width punctuation is removed.

	    Args:
	        text (str): Text to clean.

	    Returns:
	        str: The cleaned text with leading/trailing whitespace stripped.
	    """
	    # Lookbehind/lookahead keep the ideographs out of the match itself, so a
	    # character can be the right neighbour of one gap and the left neighbour
	    # of the next.  A consuming pattern such as ([CJK])\s+([CJK]) would only
	    # close every other gap in a run like "中 文 字".
	    text = re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])', '', text)
	    # Remove spaces before and after Chinese punctuation
	    text = re.sub(r'\s+([\u3000-\u303f\uff00-\uffef])', r'\1', text)
	    text = re.sub(r'([\u3000-\u303f\uff00-\uffef])\s+', r'\1', text)
	    return text.strip()


	def categorize_highlight(color):
	    """Map a highlight colour to the category of the nearest reference colour.

	    The nearest reference is the one with the smallest Euclidean distance to
	    ``color`` in RGB space.

	    Args:
	        color: Sequence of colour components as floats in the 0-1 range
	            (the reference colours below are RGB triples).

	    Returns:
	        str: The category label of the closest reference colour.
	    """
	    # Customize the categories of highlights as you like.
	    color_mapping = {
	        (0.5608, 0.8706, 0.9765): "Ideas & Insights",  # Light Blue
	        (1.0, 0.9412, 0.4): "General Notes",  # Yellow
	        (0.4902, 0.9412, 0.4): "Action Items / To-Do",  # Green
	        (0.9686, 0.6, 0.8196): "Quotes & References",  # Pink
	        (0.9216, 0.2863, 0.2863): "Critical Issues / Warnings",  # Red
	    }

	    def distance(ref_color):
	        # Euclidean distance between the input colour and one reference colour.
	        # zip pairs components positionally, so no index arithmetic is needed.
	        return sum((c - r) ** 2 for c, r in zip(color, ref_color)) ** 0.5

	    # min() returns the first reference in mapping order when distances tie.
	    best_match = min(color_mapping, key=distance)

	    return color_mapping[best_match]


	def clean_text_by_punctuation(text):
	    """Truncate text after its last sentence-ending punctuation mark.

	    Recognised sentence endings are the ASCII and full-width forms of the
	    full stop, question mark and exclamation mark.

	    Args:
	        text (str): Text to truncate.

	    Returns:
	        str: ``text`` up to and including the last ending mark, or an empty
	        string when ``text`` contains none of them.
	    """
	    # Define proper ending punctuation marks (both Chinese and English)
	    ending_punctuation = ('.', '。', '?', '？', '!', '！')

	    # rfind gives the last position of each mark (-1 when absent); the largest
	    # of those is the last sentence ending of any kind.
	    last_punct_index = max(text.rfind(mark) for mark in ending_punctuation)

	    # If no proper ending punctuation found, return empty string
	    if last_punct_index == -1:
	        return ""

	    # Return text up to and including the last punctuation mark
	    return text[:last_punct_index + 1]


	def extract_highlights(pdf_path):
	    """
	    Extract all highlights from a PDF file.

	    Each highlight annotation's text is split on blank lines, cleaned with
	    ``clean_chinese_text`` and ``clean_text_by_punctuation``, and stored with
	    its colour category and popup comment. A chunk that continues a highlight
	    from the bottom of the previous page is appended to that highlight instead
	    of creating a new entry.

	    Args:
	        pdf_path (str): Path to the PDF file

	    Returns:
	        list: List of dictionaries containing highlight information, with keys
	        ``page`` (1-based page number), ``text``, ``category``, ``comment``
	        and ``rect`` (the annotation rectangle). An empty list is returned if
	        any error occurs while processing the file.
	    """
	    try:
	        # The context manager closes the document even when processing fails
	        # part-way through.
	        with fitz.open(pdf_path) as pdf_document:
	            # NOTE(review): presumably the top/bottom y-limits of the page body
	            # and the height of a single text line -- confirm against
	            # utils.decompose.Decomposer.
	            min_y, max_y, single_y = Decomposer(pdf_document).run()
	            highlights = []
	            for page_num in range(pdf_document.page_count):
	                page = pdf_document[page_num]

	                for annot in page.annots():
	                    if annot.type[0] != 8:  # Only highlight annotations
	                        continue

	                    # Extract highlighted text
	                    highlight_text = page.get_text("text", clip=annot.rect, sort=True, flags=1).strip()
	                    # Drop anything that cannot be encoded as UTF-8.
	                    highlight_text = highlight_text.encode("utf-8", "ignore").decode("utf-8")

	                    # Extract annotation color
	                    color_rgb = annot.colors.get("stroke", [0, 0, 0])  # Default black if undefined
	                    category = categorize_highlight(color_rgb)

	                    # Extract popup comment if it exists
	                    comment = annot.info.get("content", "").strip() if annot.has_popup else ""

	                    if not highlight_text:
	                        continue

	                    # Store structured highlight data
	                    for chunk in highlight_text.split("\n\n"):  # Handle consecutive highlights
	                        cleaned_chunk = clean_text_by_punctuation(clean_chinese_text(chunk))
	                        if not cleaned_chunk:  # Skip if no valid text after cleaning
	                            continue

	                        # Stored page numbers are 1-based while page_num is
	                        # 0-based, so equality means the previous highlight is
	                        # on the immediately preceding page.
	                        continues_previous_page = (
	                            highlights and
	                            highlights[-1]['page'] == page_num and
	                            highlights[-1]['rect'][3] > (max_y - single_y) and
	                            annot.rect[1] < (min_y + single_y)
	                        )
	                        if continues_previous_page:  # Handle highlights over page
	                            highlights[-1]['text'] += cleaned_chunk
	                        else:
	                            highlights.append({
	                                "page": page_num + 1,
	                                "text": cleaned_chunk,
	                                "category": category,
	                                "comment": comment,
	                                "rect": annot.rect
	                            })

	            return highlights
	    except Exception as e:
	        # Best-effort boundary: report the problem and return no highlights.
	        print(f"Error processing PDF: {str(e)}")
	        return []


	def main(
	    pdf_path="OneDrive-2025-04-15/专题讨论_线下研讨会DeepSeek效应追踪AI产业的持续变革-12Mar2025_zho.pdf",
	    output_file="highlights_summary.md",
	):
	    """Extract highlights from a PDF and write them to a markdown summary.

	    Highlights are grouped by category, in order of first appearance. Each
	    entry lists its page number, its text and, when present, its comment as a
	    block quote.

	    Args:
	        pdf_path (str): Path of the PDF to read. Defaults to the example file.
	        output_file (str): Path of the markdown file to write.
	    """
	    highlights = extract_highlights(pdf_path)

	    # Collect the markdown pieces and join once at the end.
	    parts = [
	        "# PDF Highlights Summary\n\n",
	        f"Total highlights found: {len(highlights)}\n\n",
	    ]

	    # Group highlights by category
	    highlights_by_category = {}
	    for highlight in highlights:
	        highlights_by_category.setdefault(highlight['category'], []).append(highlight)

	    # Add highlights grouped by category
	    for category, category_highlights in highlights_by_category.items():
	        parts.append(f"## {category}\n\n")
	        for highlight in category_highlights:
	            parts.append(f"### Page {highlight['page']}\n\n")
	            parts.append(f"{highlight['text']}\n\n")
	            if highlight['comment']:
	                parts.append(f"> {highlight['comment']}\n\n")
	            parts.append("---\n\n")

	    # Save to markdown file
	    with open(output_file, "w", encoding="utf-8") as f:
	        f.write("".join(parts))

	    print(f"Highlights summary has been saved to {output_file}")


	# Run the example extraction only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()