Spaces:

mmrech
/

citation-interpreter

Sleeping

File size: 7,501 Bytes

9c6c358

import PyPDF2
from typing import Dict, List, Tuple, Optional, Any
import os
import re
import tempfile
import fitz  # PyMuPDF
import base64

class PDFProcessor:
    """
    Utility for processing PDF documents to extract text and analyze content.

    Text is extracted once, at construction time, with PyPDF2 and cached per
    page (1-indexed). Highlighting and thumbnail rendering re-open the file
    with PyMuPDF (``fitz``) on demand.
    """

    # Simple regex patterns for common citation formats. Each pattern has
    # exactly one capture group, which becomes the "text" of a citation.
    # Compiled once at class-definition time instead of on every call.
    _CITATION_PATTERNS = tuple(re.compile(p) for p in (
        r'\(([A-Za-z]+,\s*\d{4}[a-z]?)\)',  # (Author, Year)
        r'\[(\d+)\]',  # [1]
        r'(\d+\.\s*[A-Z][^.]+\.)',  # Numbered references
    ))

    # Number of characters of surrounding text kept on each side of a match.
    _CONTEXT_CHARS = 50

    def __init__(self, pdf_path: str):
        """
        Initialize the PDF processor and eagerly extract the text.

        Args:
            pdf_path: Path to the PDF file
        """
        self.pdf_path = pdf_path
        # Maps 1-based page number -> extracted text of that page.
        self.text_by_page = {}
        self.total_pages = 0
        self._extract_text()

    def _extract_text(self) -> None:
        """
        Extract text from each page of the PDF.

        Best effort: on any failure the error is printed and the processor
        is left empty (no pages, no text) rather than raising.
        """
        try:
            with open(self.pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                self.total_pages = len(reader.pages)

                for i in range(self.total_pages):
                    page = reader.pages[i]
                    # Normalize a missing result to "" so that later joins
                    # and regex searches never receive None.
                    self.text_by_page[i + 1] = page.extract_text() or ""
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            self.text_by_page = {}
            self.total_pages = 0

    def _context(self, page_text: str, start: int, end: int) -> str:
        """Return the match span plus surrounding characters, clamped to the page."""
        lo = max(0, start - self._CONTEXT_CHARS)
        hi = min(len(page_text), end + self._CONTEXT_CHARS)
        return page_text[lo:hi]

    def get_text(self, page_num: Optional[int] = None) -> str:
        """
        Get extracted text from the PDF.

        Args:
            page_num: If provided, returns text from specific page (1-based);
                otherwise returns all text

        Returns:
            Extracted text. An unknown page number yields an empty string;
            when all pages are requested they are joined by a blank line.
        """
        if page_num is not None:
            return self.text_by_page.get(page_num, "")

        return "\n\n".join(
            self.text_by_page.get(i, "") for i in range(1, self.total_pages + 1)
        )

    def find_text_location(self, text: str, page_num: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Find locations of text in the PDF.

        Matching is exact and case-sensitive; overlapping occurrences are
        all reported.

        Args:
            text: Text to find. An empty string matches nothing.
            page_num: If provided, searches only on specific page (1-based)

        Returns:
            List of locations where text was found. Each entry holds the
            "page", the "start_index"/"end_index" offsets into that page's
            text, and a "context" snippet around the match.
        """
        results = []
        # An empty needle would "match" at every offset; treat it as no match.
        if not text:
            return results

        # `is not None` (not truthiness) so that page_num=0 means "page 0",
        # which does not exist, rather than silently meaning "all pages".
        # This mirrors get_text().
        if page_num is not None:
            pages_to_search = [page_num]
        else:
            pages_to_search = range(1, self.total_pages + 1)

        for page in pages_to_search:
            page_text = self.text_by_page.get(page, "")
            if not page_text:
                continue

            idx = page_text.find(text)
            while idx != -1:
                end = idx + len(text)
                results.append({
                    "page": page,
                    "start_index": idx,
                    "end_index": end,
                    "context": self._context(page_text, idx, end)
                })
                # Advance by one character only, so overlapping hits count.
                idx = page_text.find(text, idx + 1)

        return results

    def extract_citations(self) -> List[Dict[str, Any]]:
        """
        Extract potential citations from the PDF using pattern matching.

        Results are grouped by page, then by pattern, then by position; the
        same span of text may be reported by more than one pattern.

        Returns:
            List of potential citations with page numbers. Each entry holds
            the full match ("citation"), its first capture group ("text"),
            the 1-based "page", "start_index"/"end_index" offsets into the
            page text, and a "context" snippet.
        """
        results = []

        for page_num in range(1, self.total_pages + 1):
            page_text = self.text_by_page.get(page_num, "")

            for pattern in self._CITATION_PATTERNS:
                for match in pattern.finditer(page_text):
                    results.append({
                        "citation": match.group(0),
                        "text": match.group(1),
                        "page": page_num,
                        "start_index": match.start(),
                        "end_index": match.end(),
                        "context": self._context(page_text, match.start(), match.end())
                    })

        return results

    def highlight_pdf(self, citation_locations: List[Dict[str, Any]]) -> str:
        """
        Create a new PDF with highlighted citations.

        Args:
            citation_locations: List of citation locations to highlight.
                Each entry is read for "page" (1-based, defaults to 1),
                "text" (the string searched for on that page) and,
                optionally, "citation_index" and "source_text" which are
                copied into the annotation's title and content.

        Returns:
            Path to the highlighted PDF. The file is a temporary file that
            is NOT deleted automatically; the caller owns it.
        """
        # Open the PDF with PyMuPDF
        doc = fitz.open(self.pdf_path)
        try:
            # Group citations by page
            citations_by_page = {}
            for citation in citation_locations:
                page_num = citation.get("page", 1) - 1  # PyMuPDF uses 0-indexed pages
                citations_by_page.setdefault(page_num, []).append(citation)

            # Highlight each citation
            for page_num, citations in citations_by_page.items():
                # Skip pages outside the document. Negative indices are
                # rejected too: they would otherwise wrap around and put
                # highlights on the wrong page.
                if not 0 <= page_num < len(doc):
                    continue

                page = doc[page_num]

                for citation in citations:
                    search_text = citation.get("text", "")
                    if not search_text:
                        continue

                    # Find and highlight all instances of the citation text in the page
                    for inst in page.search_for(search_text):
                        # Create a yellow highlight annotation
                        highlight = page.add_highlight_annot(inst)
                        # Add metadata
                        highlight.set_info({
                            "title": f"Citation {citation.get('citation_index', '')}",
                            "content": f"Source: {citation.get('source_text', '')}"
                        })

            # Reserve a temporary path; close our handle first so the save
            # below can write to the same path.
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
            temp_file.close()
            try:
                doc.save(temp_file.name)
            except Exception:
                # Do not leave an orphaned empty temp file behind.
                os.unlink(temp_file.name)
                raise
        finally:
            # Always release the document, even if highlighting or saving failed.
            doc.close()

        return temp_file.name

    def generate_page_thumbnails(self, max_pages: int = 5) -> List[Dict[str, Any]]:
        """
        Generate thumbnails for the first few pages of the PDF.

        Best effort: on failure the error is printed and the thumbnails
        produced so far are returned.

        Args:
            max_pages: Maximum number of pages to generate thumbnails for

        Returns:
            List of page thumbnails as data URIs. Each entry holds the
            1-based "page" number and a PNG "thumbnail" data URI.
        """
        thumbnails = []

        try:
            doc = fitz.open(self.pdf_path)
            try:
                pages_to_process = min(max_pages, len(doc))

                for page_num in range(pages_to_process):
                    page = doc[page_num]
                    # Render page to an image
                    pix = page.get_pixmap(matrix=fitz.Matrix(0.2, 0.2))  # Reduced size for thumbnails

                    # Convert to data URI
                    img_data = pix.tobytes("png")
                    b64_data = base64.b64encode(img_data).decode()
                    data_uri = f"data:image/png;base64,{b64_data}"

                    thumbnails.append({
                        "page": page_num + 1,
                        "thumbnail": data_uri
                    })
            finally:
                # Close the document even when rendering a page raises.
                doc.close()
        except Exception as e:
            print(f"Error generating thumbnails: {e}")

        return thumbnails