Spaces:
Sleeping
Sleeping
File size: 7,501 Bytes
9c6c358 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 |
import PyPDF2
from typing import Dict, List, Tuple, Optional, Any
import os
import re
import tempfile
import fitz # PyMuPDF
import base64
class PDFProcessor:
    """
    Utility for processing PDF documents to extract text and analyze content.

    Text is extracted once, at construction time, with PyPDF2 and cached per
    page (1-indexed). Highlighting and thumbnail rendering re-open the file
    with PyMuPDF (``fitz``) on demand.
    """

    # Number of characters of surrounding text included on each side of a
    # match in the "context" field of search/citation results.
    _CONTEXT_CHARS = 50

    # Simple regex patterns for common citation formats, compiled once so
    # they are not rebuilt for every page.
    _CITATION_PATTERNS = (
        re.compile(r'\(([A-Za-z]+,\s*\d{4}[a-z]?)\)'),  # (Author, Year)
        re.compile(r'\[(\d+)\]'),  # [1]
        re.compile(r'(\d+\.\s*[A-Z][^.]+\.)'),  # Numbered references
    )

    def __init__(self, pdf_path: str):
        """
        Initialize the PDF processor and eagerly extract the text of every page.

        Args:
            pdf_path: Path to the PDF file
        """
        self.pdf_path = pdf_path
        self.text_by_page: Dict[int, str] = {}
        self.total_pages = 0
        self._extract_text()

    def _extract_text(self) -> None:
        """
        Extract text from each page of the PDF into ``self.text_by_page``.

        Extraction is best-effort: on any failure the error is printed and the
        processor is left empty (no pages, no text) rather than raising.
        """
        try:
            with open(self.pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                self.total_pages = len(reader.pages)
                for page_number, page in enumerate(reader.pages, start=1):
                    # Guard against a falsy/None extraction result so that
                    # get_text() can always join the cached values.
                    self.text_by_page[page_number] = page.extract_text() or ""
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            self.text_by_page = {}
            self.total_pages = 0

    def get_text(self, page_num: Optional[int] = None) -> str:
        """
        Get extracted text from the PDF.

        Args:
            page_num: If provided, returns text from that 1-indexed page;
                otherwise returns all text

        Returns:
            Extracted text. An unknown page yields an empty string; the full
            text is the pages joined by a blank line.
        """
        if page_num is not None:
            return self.text_by_page.get(page_num, "")
        return "\n\n".join(
            self.text_by_page.get(page, "")
            for page in range(1, self.total_pages + 1)
        )

    def find_text_location(self, text: str, page_num: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Find locations of text in the PDF.

        Args:
            text: Text to find (exact, case-sensitive substring match)
            page_num: If provided, searches only on that 1-indexed page

        Returns:
            List of locations where text was found. Each entry has the keys
            "page", "start_index", "end_index" and "context". Overlapping
            occurrences are all reported. An empty ``text`` yields no results.
        """
        results: List[Dict[str, Any]] = []
        # An empty needle "matches" at every offset; treat it as no match.
        if not text:
            return results

        # Use an explicit None check so page_num=0 is not mistaken for
        # "search every page" (consistent with get_text).
        if page_num is not None:
            pages_to_search = [page_num]
        else:
            pages_to_search = range(1, self.total_pages + 1)

        needle_len = len(text)
        for page in pages_to_search:
            page_text = self.text_by_page.get(page, "")
            if not page_text:
                continue
            idx = page_text.find(text)
            while idx != -1:
                end_idx = idx + needle_len
                results.append({
                    "page": page,
                    "start_index": idx,
                    "end_index": end_idx,
                    "context": page_text[
                        max(0, idx - self._CONTEXT_CHARS):
                        min(len(page_text), end_idx + self._CONTEXT_CHARS)
                    ],
                })
                # Advance by one character so overlapping matches are found.
                idx = page_text.find(text, idx + 1)
        return results

    def extract_citations(self) -> List[Dict[str, Any]]:
        """
        Extract potential citations from the PDF using pattern matching.

        Returns:
            List of potential citations with page numbers. Each entry has the
            keys "citation" (full match), "text" (first capture group),
            "page", "start_index", "end_index" and "context". Results are
            ordered by page, then by pattern, then by position.
        """
        results: List[Dict[str, Any]] = []
        for page_num in range(1, self.total_pages + 1):
            page_text = self.text_by_page.get(page_num, "")
            for pattern in self._CITATION_PATTERNS:
                for match in pattern.finditer(page_text):
                    results.append({
                        "citation": match.group(0),
                        "text": match.group(1),
                        "page": page_num,
                        "start_index": match.start(),
                        "end_index": match.end(),
                        "context": page_text[
                            max(0, match.start() - self._CONTEXT_CHARS):
                            min(len(page_text), match.end() + self._CONTEXT_CHARS)
                        ],
                    })
        return results

    def highlight_pdf(self, citation_locations: List[Dict[str, Any]]) -> str:
        """
        Create a new PDF with highlighted citations.

        Args:
            citation_locations: List of citation locations to highlight. Each
                entry is read for "page" (1-indexed, defaults to 1), "text"
                (the string searched for on that page) and, for the
                annotation metadata, "citation_index" and "source_text".

        Returns:
            Path to the highlighted PDF (a temporary file the caller is
            responsible for deleting)
        """
        # Group citations by 0-indexed page (PyMuPDF uses 0-indexed pages).
        citations_by_page: Dict[int, List[Dict[str, Any]]] = {}
        for citation in citation_locations:
            page_index = citation.get("page", 1) - 1
            citations_by_page.setdefault(page_index, []).append(citation)

        # Open the PDF with PyMuPDF
        doc = fitz.open(self.pdf_path)
        try:
            # Highlight each citation
            for page_index, citations in citations_by_page.items():
                # Skip pages outside the document; a negative index must not
                # wrap around to a page counted from the end.
                if not 0 <= page_index < len(doc):
                    continue
                page = doc[page_index]
                for citation in citations:
                    search_text = citation.get("text", "")
                    if not search_text:
                        continue
                    # Highlight every instance of the citation text on the page
                    for inst in page.search_for(search_text):
                        highlight = page.add_highlight_annot(inst)
                        # Add metadata
                        highlight.set_info({
                            "title": f"Citation {citation.get('citation_index', '')}",
                            "content": f"Source: {citation.get('source_text', '')}"
                        })

            # Reserve a temporary path, then let PyMuPDF write to it.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                output_path = temp_file.name
            try:
                doc.save(output_path)
            except Exception:
                # Do not leave an empty/partial temp file behind on failure.
                try:
                    os.remove(output_path)
                except OSError:
                    pass
                raise
        finally:
            # Always release the document, even if highlighting or saving fails.
            doc.close()
        return output_path

    def generate_page_thumbnails(self, max_pages: int = 5) -> List[Dict[str, Any]]:
        """
        Generate thumbnails for the first few pages of the PDF.

        Rendering is best-effort: on failure the error is printed and the
        thumbnails produced so far are returned.

        Args:
            max_pages: Maximum number of pages to generate thumbnails for

        Returns:
            List of dicts with "page" (1-indexed) and "thumbnail" (a PNG
            data URI)
        """
        thumbnails: List[Dict[str, Any]] = []
        doc = None
        try:
            doc = fitz.open(self.pdf_path)
            pages_to_process = min(max_pages, len(doc))
            for page_index in range(pages_to_process):
                page = doc[page_index]
                # Render page to an image at reduced size for thumbnails
                pix = page.get_pixmap(matrix=fitz.Matrix(0.2, 0.2))
                # Convert to data URI
                b64_data = base64.b64encode(pix.tobytes("png")).decode()
                thumbnails.append({
                    "page": page_index + 1,
                    "thumbnail": f"data:image/png;base64,{b64_data}"
                })
        except Exception as e:
            print(f"Error generating thumbnails: {e}")
        finally:
            # Close the document even when rendering raised part-way through.
            if doc is not None:
                doc.close()
        return thumbnails