"""
PDF Reader Tool - Extract text and metadata from PDF files
"""

import logging
from typing import Dict, Any
from pathlib import Path

logger = logging.getLogger(__name__)


def _require_existing_path(file_path: str) -> Path:
    """Return ``file_path`` as a ``Path``, raising if nothing exists there.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"PDF file not found: {file_path}")
    return path


def _extract_text(reader: Any) -> str:
    """Join the text of every page of ``reader``, each under a page header.

    Pages whose extraction returns empty text are omitted. Pages whose
    extraction raises are logged and represented by an
    ``[Extraction failed]`` placeholder, so one bad page does not abort
    the whole document.
    """
    text_parts = []
    for page_num, page in enumerate(reader.pages, 1):
        try:
            text = page.extract_text()
            if text:
                text_parts.append(f"--- Page {page_num} ---\n{text}")
        except Exception as e:
            # Best effort: keep going and mark the page as failed.
            logger.warning(
                "Failed to extract text from page %s: %s", page_num, e
            )
            text_parts.append(f"--- Page {page_num} ---\n[Extraction failed]")
    return "\n\n".join(text_parts)


def _extract_metadata(reader: Any) -> Dict[str, Any]:
    """Return selected document-information fields from ``reader``.

    Returns an empty dict when the reader exposes no (or empty) metadata.
    Missing fields default to ``"Unknown"``. Only ``creation_date`` is
    coerced with ``str``; the other values are returned as the reader
    provides them.
    """
    info = reader.metadata
    if not info:
        return {}
    return {
        "author": info.get("/Author", "Unknown"),
        "creator": info.get("/Creator", "Unknown"),
        "producer": info.get("/Producer", "Unknown"),
        "subject": info.get("/Subject", "Unknown"),
        "title": info.get("/Title", "Unknown"),
        "creation_date": str(info.get("/CreationDate", "Unknown")),
    }


def read_pdf(file_path: str) -> Dict[str, Any]:
    """
    Read and extract text from a PDF file.

    Args:
        file_path: Path to the PDF file

    Returns:
        Dictionary with keys ``"text"`` (page texts joined by blank lines,
        each preceded by a ``--- Page N ---`` header), ``"pages"`` (page
        count) and ``"metadata"`` (dict, possibly empty)

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ImportError: If PyPDF2 is not installed.
        Exception: Any other error raised while reading is logged and
            re-raised unchanged.
    """
    try:
        # Validate before the lazy import so that a bad path is reported as
        # a bad path even when PyPDF2 is not installed.
        _require_existing_path(file_path)

        from PyPDF2 import PdfReader

        reader = PdfReader(file_path)

        return {
            "text": _extract_text(reader),
            "pages": len(reader.pages),
            "metadata": _extract_metadata(reader),
        }

    except ImportError:
        logger.error("PyPDF2 not installed. Install with: pip install pypdf2")
        raise
    except Exception as e:
        logger.error("Error reading PDF: %s", e)
        raise


def get_pdf_info(file_path: str) -> Dict[str, Any]:
    """
    Get basic information about a PDF without extracting all text.

    Args:
        file_path: Path to the PDF file

    Returns:
        Dictionary with keys ``"page_count"``, ``"is_encrypted"``,
        ``"file_size_bytes"`` and ``"file_name"``

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ImportError: If PyPDF2 is not installed.
        Exception: Any other error raised while reading is logged and
            re-raised unchanged.
    """
    try:
        # Same validation and ordering as read_pdf, for a consistent
        # "PDF file not found" error across both entry points.
        path = _require_existing_path(file_path)

        from PyPDF2 import PdfReader

        reader = PdfReader(file_path)

        return {
            "page_count": len(reader.pages),
            "is_encrypted": reader.is_encrypted,
            "file_size_bytes": path.stat().st_size,
            "file_name": path.name,
        }

    except ImportError:
        logger.error("PyPDF2 not installed. Install with: pip install pypdf2")
        raise
    except Exception as e:
        logger.error("Error getting PDF info: %s", e)
        raise