Spaces:

mmrech
/

citation-interpreter

Sleeping

App Files Files Community

citation-interpreter / utils /pdf_processor.py

mmrech

Upload folder using huggingface_hub

9c6c358 verified 11 months ago

raw

history blame contribute delete

7.5 kB

	import PyPDF2
	from typing import Dict, List, Tuple, Optional, Any
	import os
	import re
	import tempfile
	import fitz # PyMuPDF
	import base64

	class PDFProcessor:
	    """
	    Utility for processing PDF documents to extract text and analyze content.

	    Text is extracted once, at construction time, with PyPDF2 and cached per
	    page (1-indexed). Highlighting and thumbnail rendering re-open the file
	    with PyMuPDF on demand.
	    """

	    # Number of characters of surrounding text kept on each side of a match.
	    _CONTEXT_CHARS = 50

	    # Simple regex patterns for common citation formats. Compiled once at
	    # class-definition time instead of on every extract_citations() call.
	    _CITATION_PATTERNS = (
	        re.compile(r'\(([A-Za-z]+,\s*\d{4}[a-z]?)\)'),  # (Author, Year)
	        re.compile(r'\[(\d+)\]'),  # [1]
	        re.compile(r'(\d+\.\s*[A-Z][^.]+\.)'),  # Numbered references
	    )

	    def __init__(self, pdf_path: str):
	        """
	        Initialize the PDF processor and eagerly extract the text of every page.

	        Args:
	            pdf_path: Path to the PDF file
	        """
	        self.pdf_path = pdf_path
	        # Maps 1-indexed page number -> extracted text of that page.
	        self.text_by_page: Dict[int, str] = {}
	        self.total_pages = 0
	        self._extract_text()

	    def _extract_text(self) -> None:
	        """
	        Extract text from each page of the PDF.

	        Extraction is best-effort: on any failure the error is printed and the
	        processor is left empty (no pages, no text) rather than raising.
	        """
	        try:
	            with open(self.pdf_path, 'rb') as file:
	                reader = PyPDF2.PdfReader(file)
	                self.total_pages = len(reader.pages)

	                for index, page in enumerate(reader.pages, start=1):
	                    # Some PyPDF2 versions may return None for pages without a
	                    # text layer; normalise so later str operations are safe.
	                    self.text_by_page[index] = page.extract_text() or ""
	        except Exception as e:
	            print(f"Error extracting text from PDF: {e}")
	            # Reset so a failure part-way through never leaves partial state.
	            self.text_by_page = {}
	            self.total_pages = 0

	    def _context(self, page_text: str, start: int, end: int) -> str:
	        """Return the matched span plus up to _CONTEXT_CHARS characters on each side."""
	        return page_text[max(0, start - self._CONTEXT_CHARS):end + self._CONTEXT_CHARS]

	    def get_text(self, page_num: Optional[int] = None) -> str:
	        """
	        Get extracted text from the PDF.

	        Args:
	            page_num: If provided, returns text from that 1-indexed page;
	                otherwise returns all text

	        Returns:
	            Extracted text. An unknown page yields an empty string; when all
	            pages are requested they are joined with a blank line.
	        """
	        if page_num is not None:
	            return self.text_by_page.get(page_num, "")

	        return "\n\n".join(
	            self.text_by_page.get(number, "")
	            for number in range(1, self.total_pages + 1)
	        )

	    def find_text_location(self, text: str, page_num: Optional[int] = None) -> List[Dict[str, Any]]:
	        """
	        Find locations of text in the PDF.

	        Matching is exact and case-sensitive; overlapping occurrences are all
	        reported.

	        Args:
	            text: Text to find
	            page_num: If provided, searches only on that 1-indexed page

	        Returns:
	            List of locations where text was found. Each entry holds "page",
	            "start_index", "end_index" (character offsets into the page text)
	            and "context" (the match with surrounding text).
	        """
	        results: List[Dict[str, Any]] = []

	        # An empty needle "matches" at every offset; report nothing instead.
	        if not text:
	            return results

	        # Compare against None (as get_text does) so that page 0 is treated as
	        # an unknown page rather than as "search every page".
	        if page_num is not None:
	            pages_to_search = [page_num]
	        else:
	            pages_to_search = range(1, self.total_pages + 1)

	        for page in pages_to_search:
	            page_text = self.text_by_page.get(page, "")
	            if not page_text:
	                continue

	            idx = page_text.find(text)
	            while idx != -1:
	                end = idx + len(text)
	                results.append({
	                    "page": page,
	                    "start_index": idx,
	                    "end_index": end,
	                    "context": self._context(page_text, idx, end),
	                })
	                # Advance by one character so overlapping matches are found.
	                idx = page_text.find(text, idx + 1)

	        return results

	    def extract_citations(self) -> List[Dict[str, Any]]:
	        """
	        Extract potential citations from the PDF using pattern matching.

	        Returns:
	            List of potential citations with page numbers. Each entry holds
	            "citation" (full match), "text" (first capture group), "page",
	            "start_index", "end_index" and "context". Results are ordered by
	            page, then by pattern, then by position in the page.
	        """
	        results: List[Dict[str, Any]] = []

	        for page_num in range(1, self.total_pages + 1):
	            page_text = self.text_by_page.get(page_num, "")

	            for pattern in self._CITATION_PATTERNS:
	                for match in pattern.finditer(page_text):
	                    results.append({
	                        "citation": match.group(0),
	                        "text": match.group(1),
	                        "page": page_num,
	                        "start_index": match.start(),
	                        "end_index": match.end(),
	                        "context": self._context(page_text, match.start(), match.end()),
	                    })

	        return results

	    def highlight_pdf(self, citation_locations: List[Dict[str, Any]]) -> str:
	        """
	        Create a new PDF with highlighted citations.

	        Args:
	            citation_locations: List of citation locations to highlight. Each
	                entry is read for "page" (1-indexed, defaults to 1), "text"
	                (the string searched for on that page) and the optional
	                "citation_index" / "source_text" used as annotation metadata.

	        Returns:
	            Path to the highlighted PDF. The file is a temporary file that is
	            not deleted automatically; the caller owns its cleanup.

	        Raises:
	            Any exception raised by PyMuPDF while opening, annotating or saving
	            the document is propagated after resources are released.
	        """
	        # Group citations by 0-indexed page (PyMuPDF uses 0-indexed pages).
	        citations_by_page: Dict[int, List[Dict[str, Any]]] = {}
	        for citation in citation_locations:
	            page_index = citation.get("page", 1) - 1
	            citations_by_page.setdefault(page_index, []).append(citation)

	        # Open the PDF with PyMuPDF
	        doc = fitz.open(self.pdf_path)
	        try:
	            for page_index, citations in citations_by_page.items():
	                # Skip pages outside the document. Negative indices must be
	                # rejected explicitly, otherwise they would address pages
	                # counted from the end and highlight the wrong page.
	                if not 0 <= page_index < len(doc):
	                    continue

	                page = doc[page_index]

	                for citation in citations:
	                    search_text = citation.get("text", "")
	                    if not search_text:
	                        continue

	                    # Highlight every instance of the citation text on the page.
	                    for inst in page.search_for(search_text):
	                        highlight = page.add_highlight_annot(inst)
	                        # Add metadata
	                        highlight.set_info({
	                            "title": f"Citation {citation.get('citation_index', '')}",
	                            "content": f"Source: {citation.get('source_text', '')}"
	                        })

	            # Reserve a temporary path, then let PyMuPDF write to it.
	            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
	            temp_file.close()
	            output_path = temp_file.name
	            try:
	                doc.save(output_path)
	            except Exception:
	                # Don't leave an orphaned temp file behind when saving fails.
	                if os.path.exists(output_path):
	                    os.remove(output_path)
	                raise
	        finally:
	            # Always release the document handle, even on failure.
	            doc.close()

	        return output_path

	    def generate_page_thumbnails(self, max_pages: int = 5) -> List[Dict[str, Any]]:
	        """
	        Generate thumbnails for the first few pages of the PDF.

	        Rendering is best-effort: on failure the error is printed and the
	        thumbnails produced so far (possibly none) are returned.

	        Args:
	            max_pages: Maximum number of pages to generate thumbnails for

	        Returns:
	            List of dicts with "page" (1-indexed) and "thumbnail" (PNG data URI)
	        """
	        thumbnails: List[Dict[str, Any]] = []

	        try:
	            doc = fitz.open(self.pdf_path)
	            try:
	                for page_index in range(min(max_pages, len(doc))):
	                    page = doc[page_index]
	                    # Render page to an image at reduced size for thumbnails
	                    pix = page.get_pixmap(matrix=fitz.Matrix(0.2, 0.2))

	                    # Convert to data URI
	                    b64_data = base64.b64encode(pix.tobytes("png")).decode()
	                    thumbnails.append({
	                        "page": page_index + 1,
	                        "thumbnail": f"data:image/png;base64,{b64_data}"
	                    })
	            finally:
	                # Close the document even when rendering a page fails.
	                doc.close()
	        except Exception as e:
	            print(f"Error generating thumbnails: {e}")

	        return thumbnails