Spaces:
Sleeping
Sleeping
| import PyPDF2 | |
| from typing import Dict, List, Tuple, Optional, Any | |
| import os | |
| import re | |
| import tempfile | |
| import fitz # PyMuPDF | |
| import base64 | |
class PDFProcessor:
    """
    Utility for processing PDF documents to extract text and analyze content.

    Text is extracted once, at construction time, with PyPDF2 and cached per
    page (1-indexed). Highlighting and thumbnail rendering reopen the file
    with PyMuPDF (``fitz``) on demand.
    """

    # Compiled once for the class; order matters because extract_citations
    # reports matches pattern by pattern within each page.
    _CITATION_PATTERNS = (
        re.compile(r'\(([A-Za-z]+,\s*\d{4}[a-z]?)\)'),  # (Author, Year)
        re.compile(r'\[(\d+)\]'),  # [1]
        re.compile(r'(\d+\.\s*[A-Z][^.]+\.)'),  # Numbered references
    )

    # Number of characters of surrounding text kept on each side of a match.
    _CONTEXT_CHARS = 50

    def __init__(self, pdf_path: str):
        """
        Initialize the PDF processor and eagerly extract the text of every page.

        Args:
            pdf_path: Path to the PDF file
        """
        self.pdf_path = pdf_path
        self.text_by_page: Dict[int, str] = {}
        self.total_pages = 0
        self._extract_text()

    def _extract_text(self) -> None:
        """
        Extract text from each page of the PDF into ``self.text_by_page``.

        Extraction is best-effort: on any failure the error is printed and
        the processor is left empty (no pages, no text) rather than raising.
        """
        try:
            with open(self.pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                self.total_pages = len(reader.pages)
                for page_number, page in enumerate(reader.pages, start=1):
                    # Normalize a missing extraction result to "" so that
                    # get_text() can always join the cached values.
                    self.text_by_page[page_number] = page.extract_text() or ""
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            self.text_by_page = {}
            self.total_pages = 0

    def get_text(self, page_num: Optional[int] = None) -> str:
        """
        Get extracted text from the PDF.

        Args:
            page_num: If provided, returns text from that 1-indexed page;
                otherwise returns the text of all pages

        Returns:
            Extracted text. An unknown page yields an empty string; the
            full-document form joins pages with a blank line.
        """
        if page_num is not None:
            return self.text_by_page.get(page_num, "")
        return "\n\n".join(
            self.text_by_page.get(number, "")
            for number in range(1, self.total_pages + 1)
        )

    def find_text_location(self, text: str, page_num: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Find locations of text in the PDF.

        Matching is an exact, case-sensitive substring search over the
        extracted page text; overlapping occurrences are all reported.

        Args:
            text: Text to find
            page_num: If provided, searches only on that 1-indexed page

        Returns:
            List of dicts with "page", "start_index", "end_index" and
            "context" (the match plus up to 50 characters on each side).
            Empty when ``text`` is empty or nothing matches.
        """
        results: List[Dict[str, Any]] = []
        if not text:
            # An empty needle "matches" at every offset; report nothing
            # instead of one meaningless hit per character.
            return results

        # Compare against None (not truthiness) so that page_num=0 means
        # "that specific page", consistent with get_text().
        if page_num is not None:
            pages_to_search = [page_num]
        else:
            pages_to_search = range(1, self.total_pages + 1)

        needle_len = len(text)
        for page in pages_to_search:
            page_text = self.text_by_page.get(page, "")
            if not page_text:
                continue
            idx = page_text.find(text)
            while idx != -1:
                end = idx + needle_len
                results.append({
                    "page": page,
                    "start_index": idx,
                    "end_index": end,
                    "context": page_text[
                        max(0, idx - self._CONTEXT_CHARS):
                        min(len(page_text), end + self._CONTEXT_CHARS)
                    ]
                })
                # Advance by one character so overlapping matches are found.
                idx = page_text.find(text, idx + 1)
        return results

    def extract_citations(self) -> List[Dict[str, Any]]:
        """
        Extract potential citations from the PDF using pattern matching.

        Three simple formats are recognized: "(Author, Year)", bracketed
        numbers such as "[1]", and numbered reference entries.

        Returns:
            List of dicts with "citation" (full match), "text" (captured
            group), "page", "start_index", "end_index" and "context",
            ordered by page, then by pattern, then by position.
        """
        results: List[Dict[str, Any]] = []
        for page_num in range(1, self.total_pages + 1):
            page_text = self.text_by_page.get(page_num, "")
            for pattern in self._CITATION_PATTERNS:
                for match in pattern.finditer(page_text):
                    results.append({
                        "citation": match.group(0),
                        "text": match.group(1),
                        "page": page_num,
                        "start_index": match.start(),
                        "end_index": match.end(),
                        "context": page_text[
                            max(0, match.start() - self._CONTEXT_CHARS):
                            min(len(page_text), match.end() + self._CONTEXT_CHARS)
                        ]
                    })
        return results

    def highlight_pdf(self, citation_locations: List[Dict[str, Any]]) -> str:
        """
        Create a new PDF with highlighted citations.

        Args:
            citation_locations: List of citation dicts to highlight. Each
                uses "page" (1-indexed, defaults to 1) and "text" (string
                searched for on that page); "citation_index" and
                "source_text" are copied into the annotation metadata.

        Returns:
            Path to the highlighted PDF, written to a temporary file that
            the caller is responsible for deleting.
        """
        # Open the PDF with PyMuPDF
        doc = fitz.open(self.pdf_path)
        try:
            # Group citations by 0-indexed page (PyMuPDF uses 0-indexed pages)
            citations_by_page: Dict[int, List[Dict[str, Any]]] = {}
            for citation in citation_locations:
                page_index = citation.get("page", 1) - 1
                citations_by_page.setdefault(page_index, []).append(citation)

            # Highlight each citation
            for page_index, citations in citations_by_page.items():
                # Skip pages outside the document. Negative indices are
                # rejected explicitly because PyMuPDF would resolve them
                # from the end and highlight the wrong page.
                if page_index < 0 or page_index >= len(doc):
                    continue
                page = doc[page_index]
                for citation in citations:
                    search_text = citation.get("text", "")
                    if not search_text:
                        continue
                    # Highlight every instance of the citation text on the page
                    for inst in page.search_for(search_text):
                        highlight = page.add_highlight_annot(inst)
                        # Add metadata
                        highlight.set_info({
                            "title": f"Citation {citation.get('citation_index', '')}",
                            "content": f"Source: {citation.get('source_text', '')}"
                        })

            # Reserve a temporary file name, then let PyMuPDF write to it.
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
            temp_file.close()
            doc.save(temp_file.name)
            return temp_file.name
        finally:
            # Release the document even when searching or saving fails.
            doc.close()

    def generate_page_thumbnails(self, max_pages: int = 5) -> List[Dict[str, Any]]:
        """
        Generate thumbnails for the first few pages of the PDF.

        Rendering is best-effort: on failure the error is printed and the
        thumbnails produced so far (possibly none) are returned.

        Args:
            max_pages: Maximum number of pages to generate thumbnails for

        Returns:
            List of dicts with "page" (1-indexed) and "thumbnail" (PNG
            image encoded as a base64 data URI)
        """
        thumbnails: List[Dict[str, Any]] = []
        doc = None
        try:
            doc = fitz.open(self.pdf_path)
            pages_to_process = min(max_pages, len(doc))
            for page_index in range(pages_to_process):
                page = doc[page_index]
                # Render page to an image at reduced size for thumbnails
                pix = page.get_pixmap(matrix=fitz.Matrix(0.2, 0.2))
                # Convert to data URI
                b64_data = base64.b64encode(pix.tobytes("png")).decode()
                thumbnails.append({
                    "page": page_index + 1,
                    "thumbnail": f"data:image/png;base64,{b64_data}"
                })
        except Exception as e:
            print(f"Error generating thumbnails: {e}")
        finally:
            # Close the document on both the success and the failure path.
            if doc is not None:
                doc.close()
        return thumbnails