""" PDF Format Handler Extracts text and layout information from PDF files using pdfplumber. Supports text extraction with character-level position mapping. Usage: from potato.format_handlers.pdf_handler import PDFHandler handler = PDFHandler() output = handler.extract("document.pdf", { "extraction_mode": "text", # or "layout" "max_pages": 10, }) # Access extracted content text = output.text html = output.rendered_html coords = output.coordinate_map """ from typing import Dict, List, Any, Optional from pathlib import Path import html import logging from .base import BaseFormatHandler, FormatOutput from .coordinate_mapping import CoordinateMapper, PDFCoordinate logger = logging.getLogger(__name__) # Check if pdfplumber is available try: import pdfplumber PDFPLUMBER_AVAILABLE = True except ImportError: PDFPLUMBER_AVAILABLE = False pdfplumber = None class PDFHandler(BaseFormatHandler): """ Handler for PDF documents. Uses pdfplumber for text extraction with position information. Generates HTML representation suitable for span annotation. """ format_name = "pdf" supported_extensions = [".pdf"] description = "PDF document text extraction with page/position mapping" requires_dependencies = ["pdfplumber"] def get_default_options(self) -> Dict[str, Any]: """Get default extraction options.""" return { "extraction_mode": "text", # "text" or "layout" "preserve_layout": False, "max_pages": None, "include_page_breaks": True, "page_separator": "\n\n--- Page {page} ---\n\n", "extract_tables": False, "x_tolerance": 3, # Horizontal tolerance for word grouping "y_tolerance": 3, # Vertical tolerance for line grouping } def extract( self, file_path: str, options: Optional[Dict[str, Any]] = None ) -> FormatOutput: """ Extract text and layout from a PDF file. Args: file_path: Path to the PDF file options: Extraction options: - extraction_mode: "text" (plain) or "layout" (preserve layout) - max_pages: Maximum pages to process (None for all) - include_page_breaks: Include page separators in text - page_separator: Format string for page breaks ({page} replaced) - extract_tables: Also extract table structures Returns: FormatOutput with extracted text, HTML, and coordinate mappings """ if not PDFPLUMBER_AVAILABLE: raise ImportError( "pdfplumber is required for PDF extraction. " "Install with: pip install pdfplumber" ) opts = self.merge_options(options) mapper = CoordinateMapper() text_parts = [] html_parts = [] current_offset = 0 metadata = { "format": "pdf", "pages": [], "total_pages": 0, "source_file": str(file_path), } html_parts.append('
') with pdfplumber.open(file_path) as pdf: metadata["total_pages"] = len(pdf.pages) max_pages = opts.get("max_pages") or len(pdf.pages) for page_num, page in enumerate(pdf.pages[:max_pages], start=1): page_text, page_html, page_coords = self._extract_page( page, page_num, opts, current_offset ) # Add page coordinates to mapper for coord_info in page_coords: mapper.add_mapping( coord_info["start"], coord_info["end"], PDFCoordinate( page=page_num, bbox=coord_info.get("bbox", []), line=coord_info.get("line"), ) ) # Add page separator if page_num > 1 and opts.get("include_page_breaks"): separator = opts["page_separator"].format(page=page_num) text_parts.append(separator) current_offset += len(separator) text_parts.append(page_text) html_parts.append(page_html) current_offset += len(page_text) # Page metadata page_meta = { "page_number": page_num, "width": float(page.width), "height": float(page.height), "char_count": len(page_text), } metadata["pages"].append(page_meta) html_parts.append('
') full_text = "".join(text_parts) full_html = "\n".join(html_parts) # Create output with coordinate lookup function coord_dict = mapper.to_dict() coord_dict["get_coords_for_range"] = mapper.get_coords_for_range return FormatOutput( text=full_text, rendered_html=full_html, coordinate_map=coord_dict, metadata=metadata, format_name=self.format_name, source_path=str(file_path), ) def _extract_page( self, page, page_num: int, opts: Dict[str, Any], base_offset: int ) -> tuple: """ Extract text and HTML from a single page. Returns: Tuple of (text, html, coordinate_mappings) """ extraction_mode = opts.get("extraction_mode", "text") if extraction_mode == "layout": return self._extract_page_layout(page, page_num, opts, base_offset) else: return self._extract_page_text(page, page_num, opts, base_offset) def _extract_page_text( self, page, page_num: int, opts: Dict[str, Any], base_offset: int ) -> tuple: """ Extract text with word-level coordinate mapping. """ text_parts = [] html_parts = [] coords = [] current_offset = base_offset # Extract words with their positions words = page.extract_words( x_tolerance=opts.get("x_tolerance", 3), y_tolerance=opts.get("y_tolerance", 3), ) html_parts.append(f'
') if not words: # Fall back to full text extraction if no words found text = page.extract_text() or "" text_parts.append(text) html_parts.append(f'{html.escape(text)}') if text: coords.append({ "start": current_offset, "end": current_offset + len(text), "bbox": [0, 0, float(page.width), float(page.height)], }) else: # Process words with positions current_line_top = None line_words = [] for word in words: word_top = word["top"] # Check if this is a new line if current_line_top is None: current_line_top = word_top elif abs(word_top - current_line_top) > opts.get("y_tolerance", 3): # Flush current line if line_words: line_text, line_html, line_coords = self._process_line( line_words, current_offset ) text_parts.append(line_text) text_parts.append("\n") html_parts.append(line_html) html_parts.append("
") coords.extend(line_coords) current_offset += len(line_text) + 1 # +1 for newline line_words = [] current_line_top = word_top line_words.append(word) # Process final line if line_words: line_text, line_html, line_coords = self._process_line( line_words, current_offset ) text_parts.append(line_text) html_parts.append(line_html) coords.extend(line_coords) html_parts.append('
') return "".join(text_parts), "\n".join(html_parts), coords def _process_line( self, words: List[Dict], base_offset: int ) -> tuple: """ Process a line of words into text, HTML, and coordinates. """ text_parts = [] html_parts = [] coords = [] current_offset = base_offset for i, word in enumerate(words): word_text = word["text"] # Add space between words if i > 0: text_parts.append(" ") current_offset += 1 start = current_offset end = start + len(word_text) text_parts.append(word_text) html_parts.append( f'' f'{html.escape(word_text)}' ) # Store coordinate mapping coords.append({ "start": start, "end": end, "bbox": [ float(word["x0"]), float(word["top"]), float(word["x1"]), float(word["bottom"]), ], }) current_offset = end return "".join(text_parts), " ".join(html_parts), coords def _extract_page_layout( self, page, page_num: int, opts: Dict[str, Any], base_offset: int ) -> tuple: """ Extract text preserving visual layout. """ # Use extract_text with layout preservation text = page.extract_text(layout=True) or "" html_parts = [] html_parts.append(f'
') html_parts.append(f'
{html.escape(text)}
') html_parts.append('
') # For layout mode, we map the entire page coords = [{ "start": base_offset, "end": base_offset + len(text), "bbox": [0, 0, float(page.width), float(page.height)], }] return text, "\n".join(html_parts), coords def get_page_count(self, file_path: str) -> int: """ Get the number of pages in a PDF. Args: file_path: Path to the PDF file Returns: Number of pages """ if not PDFPLUMBER_AVAILABLE: raise ImportError("pdfplumber is required") with pdfplumber.open(file_path) as pdf: return len(pdf.pages) def extract_page( self, file_path: str, page_number: int, options: Optional[Dict[str, Any]] = None ) -> FormatOutput: """ Extract a single page from a PDF. Args: file_path: Path to the PDF file page_number: Page number (1-indexed) options: Extraction options Returns: FormatOutput for the single page """ if not PDFPLUMBER_AVAILABLE: raise ImportError("pdfplumber is required") opts = self.merge_options(options) opts["max_pages"] = page_number # Process up to this page opts["include_page_breaks"] = False # Extract only the requested page mapper = CoordinateMapper() with pdfplumber.open(file_path) as pdf: if page_number < 1 or page_number > len(pdf.pages): raise ValueError( f"Page {page_number} out of range (1-{len(pdf.pages)})" ) page = pdf.pages[page_number - 1] page_text, page_html, page_coords = self._extract_page( page, page_number, opts, 0 ) for coord_info in page_coords: mapper.add_mapping( coord_info["start"], coord_info["end"], PDFCoordinate( page=page_number, bbox=coord_info.get("bbox", []), ) ) coord_dict = mapper.to_dict() coord_dict["get_coords_for_range"] = mapper.get_coords_for_range return FormatOutput( text=page_text, rendered_html=page_html, coordinate_map=coord_dict, metadata={ "format": "pdf", "page_number": page_number, "total_pages": len(pdf.pages), }, format_name=self.format_name, source_path=str(file_path), )