# Source: citation-interpreter/utils/pdf_processor.py
# Uploaded by mmrech with huggingface_hub (commit 9c6c358, verified).
import PyPDF2
from typing import Dict, List, Tuple, Optional, Any
import os
import re
import tempfile
import fitz # PyMuPDF
import base64
class PDFProcessor:
    """
    Utility for processing PDF documents to extract text and analyze content.

    Text is extracted once, at construction time, with PyPDF2 and cached per
    page under 1-indexed keys. Highlighting and thumbnail rendering reopen the
    file with PyMuPDF (``fitz``) each time they are called.
    """

    # Number of characters of surrounding text kept on each side of a match.
    _CONTEXT_CHARS = 50

    # Simple regex patterns for common citation formats. Each pattern has
    # exactly one capturing group, which becomes the "text" of the citation.
    _CITATION_PATTERNS = (
        re.compile(r'\(([A-Za-z]+,\s*\d{4}[a-z]?)\)'),  # (Author, Year)
        re.compile(r'\[(\d+)\]'),  # [1]
        re.compile(r'(\d+\.\s*[A-Z][^.]+\.)'),  # Numbered references
    )

    def __init__(self, pdf_path: str):
        """
        Initialize the PDF processor and extract the text of every page.

        Args:
            pdf_path: Path to the PDF file
        """
        self.pdf_path = pdf_path
        self.text_by_page = {}
        self.total_pages = 0
        self._extract_text()

    def _extract_text(self) -> None:
        """
        Extract text from each page of the PDF.

        Fills ``text_by_page`` (keys start at 1) and ``total_pages``. This is
        best-effort: on any failure the error is printed and the processor is
        left empty instead of raising.
        """
        extracted = {}
        try:
            with open(self.pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                for number, page in enumerate(reader.pages, start=1):
                    # Normalise a missing result to "" so later joins and
                    # searches never have to deal with None.
                    extracted[number] = page.extract_text() or ""
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            self.text_by_page = {}
            self.total_pages = 0
            return
        self.text_by_page = extracted
        self.total_pages = len(extracted)

    def get_text(self, page_num: Optional[int] = None) -> str:
        """
        Get extracted text from the PDF.

        Args:
            page_num: If provided, returns text from that 1-indexed page;
                otherwise returns all text

        Returns:
            Extracted text. An unknown page yields an empty string; the text
            of all pages is joined with blank lines.
        """
        if page_num is not None:
            return self.text_by_page.get(page_num, "")
        return "\n\n".join(
            self.text_by_page.get(number, "")
            for number in range(1, self.total_pages + 1)
        )

    def find_text_location(self, text: str, page_num: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Find locations of text in the PDF.

        Matching is an exact, case-sensitive substring search over the
        extracted page text; overlapping occurrences are all reported.

        Args:
            text: Text to find. An empty string matches nothing.
            page_num: If provided, searches only that 1-indexed page

        Returns:
            List of dicts with "page", "start_index", "end_index" and
            "context" (the match plus surrounding characters), ordered by
            page and then by position.
        """
        # An empty needle would otherwise "match" at every offset.
        if not text:
            return []

        # Compare against None (as get_text does) so that an explicit page
        # number of 0 is not mistaken for "search every page".
        if page_num is not None:
            pages_to_search = [page_num]
        else:
            pages_to_search = range(1, self.total_pages + 1)

        results = []
        for page in pages_to_search:
            page_text = self.text_by_page.get(page, "")
            if not page_text:
                continue
            idx = page_text.find(text)
            while idx != -1:
                end = idx + len(text)
                results.append({
                    "page": page,
                    "start_index": idx,
                    "end_index": end,
                    "context": page_text[max(0, idx - self._CONTEXT_CHARS):end + self._CONTEXT_CHARS],
                })
                # Advance by one character only, so overlapping hits are kept.
                idx = page_text.find(text, idx + 1)
        return results

    def extract_citations(self) -> List[Dict[str, Any]]:
        """
        Extract potential citations from the PDF using pattern matching.

        Returns:
            List of potential citations. Each entry holds the full match
            ("citation"), its captured inner text ("text"), the 1-indexed
            "page", the match offsets and the surrounding "context". Results
            are grouped by page and, within a page, by pattern.
        """
        results = []
        for page_num in range(1, self.total_pages + 1):
            page_text = self.text_by_page.get(page_num, "")
            for pattern in self._CITATION_PATTERNS:
                for match in pattern.finditer(page_text):
                    results.append({
                        "citation": match.group(0),
                        "text": match.group(1),
                        "page": page_num,
                        "start_index": match.start(),
                        "end_index": match.end(),
                        "context": page_text[
                            max(0, match.start() - self._CONTEXT_CHARS):match.end() + self._CONTEXT_CHARS
                        ],
                    })
        return results

    def highlight_pdf(self, citation_locations: List[Dict[str, Any]]) -> str:
        """
        Create a new PDF with highlighted citations.

        Args:
            citation_locations: List of citation locations to highlight. Each
                entry is read for "page" (1-indexed, defaults to 1), "text"
                (the string searched for on that page) and, for the
                annotation metadata, "citation_index" and "source_text".

        Returns:
            Path to the highlighted PDF, a temporary file that the caller is
            responsible for deleting.
        """
        # Open the PDF with PyMuPDF
        doc = fitz.open(self.pdf_path)
        try:
            # Group citations by page
            citations_by_page = {}
            for citation in citation_locations:
                page_index = citation.get("page", 1) - 1  # PyMuPDF uses 0-indexed pages
                citations_by_page.setdefault(page_index, []).append(citation)

            # Highlight each citation
            for page_index, citations in citations_by_page.items():
                # Reject negative indices as well: PyMuPDF would resolve them
                # from the end of the document and mark the wrong page.
                if not 0 <= page_index < len(doc):
                    continue
                page = doc[page_index]
                for citation in citations:
                    search_text = citation.get("text", "")
                    if not search_text:
                        continue
                    # Highlight every instance of the citation text on the page
                    for inst in page.search_for(search_text):
                        highlight = page.add_highlight_annot(inst)
                        # Add metadata
                        highlight.set_info({
                            "title": f"Citation {citation.get('citation_index', '')}",
                            "content": f"Source: {citation.get('source_text', '')}"
                        })

            # Reserve a temporary file name, closing the handle before
            # PyMuPDF writes to that path.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                output_path = temp_file.name
            doc.save(output_path)
        finally:
            # Release the document even when searching or saving fails.
            doc.close()
        return output_path

    def generate_page_thumbnails(self, max_pages: int = 5) -> List[Dict[str, Any]]:
        """
        Generate thumbnails for the first few pages of the PDF.

        Best-effort: if rendering fails the error is printed and the
        thumbnails produced so far are returned.

        Args:
            max_pages: Maximum number of pages to generate thumbnails for

        Returns:
            List of dicts with the 1-indexed "page" and its "thumbnail" as a
            PNG data URI
        """
        thumbnails = []
        doc = None
        try:
            doc = fitz.open(self.pdf_path)
            pages_to_process = min(max_pages, len(doc))
            for page_index in range(pages_to_process):
                page = doc[page_index]
                # Render page to an image at reduced size for thumbnails
                pix = page.get_pixmap(matrix=fitz.Matrix(0.2, 0.2))
                # Convert to data URI
                b64_data = base64.b64encode(pix.tobytes("png")).decode()
                thumbnails.append({
                    "page": page_index + 1,
                    "thumbnail": f"data:image/png;base64,{b64_data}"
                })
        except Exception as e:
            print(f"Error generating thumbnails: {e}")
        finally:
            # Close the document on the failure path too.
            if doc is not None:
                doc.close()
        return thumbnails