import base64
import os
import re
import tempfile
from typing import Dict, List, Tuple, Optional, Any

import fitz  # PyMuPDF
import PyPDF2


class PDFProcessor:
    """
    Utility for processing PDF documents to extract text and analyze content.

    Text is extracted once, at construction time, with PyPDF2 and cached per
    page (1-indexed). Highlighting and thumbnail rendering reopen the file
    with PyMuPDF on demand.
    """

    # Number of characters of surrounding text kept on each side of a match.
    _CONTEXT_CHARS = 50

    # Simple regex patterns for common citation formats, compiled once so
    # they are not rebuilt for every page. Each pattern has exactly one
    # capture group, which becomes the "text" field of a result.
    _CITATION_PATTERNS = (
        re.compile(r'\(([A-Za-z]+,\s*\d{4}[a-z]?)\)'),  # (Author, Year)
        re.compile(r'\[(\d+)\]'),  # [1]
        re.compile(r'(\d+\.\s*[A-Z][^.]+\.)'),  # Numbered references
    )

    def __init__(self, pdf_path: str):
        """
        Initialize the PDF processor and extract the text of every page.

        Args:
            pdf_path: Path to the PDF file
        """
        self.pdf_path = pdf_path
        # Maps 1-indexed page number -> extracted text of that page.
        self.text_by_page: Dict[int, str] = {}
        self.total_pages = 0
        self._extract_text()

    def _extract_text(self) -> None:
        """
        Extract text from each page of the PDF into ``self.text_by_page``.

        This is best-effort: on any error the message is printed and the
        cached text and page count are reset to empty, so a failed load
        never leaves partially populated state behind.
        """
        try:
            with open(self.pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                self.total_pages = len(reader.pages)
                for page_number, page in enumerate(reader.pages, start=1):
                    # Normalize a missing/None extraction result to "" so
                    # that joining and searching the text cannot fail later.
                    self.text_by_page[page_number] = page.extract_text() or ""
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            self.text_by_page = {}
            self.total_pages = 0

    def get_text(self, page_num: Optional[int] = None) -> str:
        """
        Get extracted text from the PDF.

        Args:
            page_num: If provided, returns text from that 1-indexed page;
                otherwise returns all text

        Returns:
            Extracted text. Pages are joined with a blank line when the
            whole document is requested; an unknown page yields "".
        """
        if page_num is not None:
            return self.text_by_page.get(page_num, "")

        return "\n\n".join(
            self.text_by_page.get(page, "")
            for page in range(1, self.total_pages + 1)
        )

    def find_text_location(self, text: str, page_num: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Find locations of text in the PDF.

        Args:
            text: Text to find
            page_num: If provided, searches only on that 1-indexed page

        Returns:
            List of locations where text was found. Each entry is a dict
            with the keys "page", "start_index", "end_index" and "context"
            (the match plus up to 50 characters on either side). Indices
            are offsets into the extracted text of the page. Overlapping
            occurrences are all reported. An empty search string yields an
            empty list.
        """
        # An empty needle matches at every offset, which is never useful.
        if not text:
            return []

        # Compare against None (as get_text does) so that an explicit page
        # number of 0 is not mistaken for "search every page".
        if page_num is not None:
            pages_to_search = [page_num]
        else:
            pages_to_search = range(1, self.total_pages + 1)

        results = []
        needle_len = len(text)
        for page in pages_to_search:
            page_text = self.text_by_page.get(page, "")
            if not page_text:
                continue

            idx = page_text.find(text)
            while idx != -1:
                end = idx + needle_len
                results.append({
                    "page": page,
                    "start_index": idx,
                    "end_index": end,
                    "context": page_text[max(0, idx - self._CONTEXT_CHARS):end + self._CONTEXT_CHARS],
                })
                # Advance by one character only, so overlapping hits count.
                idx = page_text.find(text, idx + 1)

        return results

    def extract_citations(self) -> List[Dict[str, Any]]:
        """
        Extract potential citations from the PDF using pattern matching.

        Returns:
            List of potential citations with page numbers. Each entry is a
            dict with the keys "citation" (the full match), "text" (the
            captured group), "page", "start_index", "end_index" and
            "context". Results are ordered by page, then by pattern, then
            by position within the page.
        """
        results = []

        for page_num in range(1, self.total_pages + 1):
            page_text = self.text_by_page.get(page_num, "")

            for pattern in self._CITATION_PATTERNS:
                for match in pattern.finditer(page_text):
                    start, end = match.start(), match.end()
                    results.append({
                        "citation": match.group(0),
                        "text": match.group(1),
                        "page": page_num,
                        "start_index": start,
                        "end_index": end,
                        "context": page_text[max(0, start - self._CONTEXT_CHARS):end + self._CONTEXT_CHARS],
                    })

        return results

    def highlight_pdf(self, citation_locations: List[Dict[str, Any]]) -> str:
        """
        Create a new PDF with highlighted citations.

        Args:
            citation_locations: List of citation locations to highlight.
                Each entry is read for "page" (1-indexed, defaults to 1),
                "text" (the string searched for on that page), and the
                optional "citation_index" and "source_text" used for the
                annotation metadata.

        Returns:
            Path to the highlighted PDF. The file is a temporary file that
            is not deleted automatically; the caller owns it.
        """
        # Open the PDF with PyMuPDF
        doc = fitz.open(self.pdf_path)
        try:
            # Group citations by 0-indexed page (PyMuPDF uses 0-indexed pages)
            citations_by_page: Dict[int, List[Dict[str, Any]]] = {}
            for citation in citation_locations:
                page_index = citation.get("page", 1) - 1
                citations_by_page.setdefault(page_index, []).append(citation)

            # Highlight each citation
            for page_index, citations in citations_by_page.items():
                # Skip pages outside the document. Negative indices are
                # rejected too: PyMuPDF would resolve them from the end of
                # the document and highlight the wrong page.
                if page_index < 0 or page_index >= len(doc):
                    continue

                page = doc[page_index]

                for citation in citations:
                    search_text = citation.get("text", "")
                    if not search_text:
                        continue

                    # Highlight every instance of the citation text on the page
                    for inst in page.search_for(search_text):
                        highlight = page.add_highlight_annot(inst)

                        # Add metadata
                        highlight.set_info({
                            "title": f"Citation {citation.get('citation_index', '')}",
                            "content": f"Source: {citation.get('source_text', '')}"
                        })

            # Reserve a temporary path, then let PyMuPDF write to it.
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
            temp_file.close()
            try:
                doc.save(temp_file.name)
            except Exception:
                # Do not leave an empty or partial file behind on failure.
                if os.path.exists(temp_file.name):
                    os.remove(temp_file.name)
                raise

            return temp_file.name
        finally:
            # Always release the document, even if highlighting or saving fails.
            doc.close()

    def generate_page_thumbnails(self, max_pages: int = 5) -> List[Dict[str, Any]]:
        """
        Generate thumbnails for the first few pages of the PDF.

        This is best-effort: if opening or rendering fails, the error is
        printed and the thumbnails produced so far are returned.

        Args:
            max_pages: Maximum number of pages to generate thumbnails for

        Returns:
            List of dicts with the keys "page" (1-indexed) and "thumbnail"
            (a PNG data URI)
        """
        thumbnails = []

        try:
            doc = fitz.open(self.pdf_path)
            try:
                pages_to_process = min(max_pages, len(doc))

                for page_index in range(pages_to_process):
                    page = doc[page_index]

                    # Render page to an image at reduced size for thumbnails
                    pix = page.get_pixmap(matrix=fitz.Matrix(0.2, 0.2))

                    # Convert to data URI
                    img_data = pix.tobytes("png")
                    b64_data = base64.b64encode(img_data).decode()

                    thumbnails.append({
                        "page": page_index + 1,
                        "thumbnail": f"data:image/png;base64,{b64_data}"
                    })
            finally:
                # Close the document even when rendering a page raises.
                doc.close()
        except Exception as e:
            print(f"Error generating thumbnails: {e}")

        return thumbnails