MissionControlMCP / tools /pdf_reader.py
AlBaraa63's picture
Initial commit: MissionControlMCP - 8 Enterprise Automation Tools
c3de917
"""
PDF Reader Tool - Extract text and metadata from PDF files
"""
import logging
from typing import Dict, Any
from pathlib import Path
logger = logging.getLogger(__name__)
def read_pdf(file_path: str) -> Dict[str, Any]:
    """
    Parse a PDF file and return its text together with document details.

    Text is gathered page by page. Every page that yields text is prefixed
    with a ``--- Page N ---`` header (N is 1-based) and pages are separated
    by a blank line. Pages that yield no text are omitted; a page whose
    extraction raises is logged as a warning and represented by an
    ``[Extraction failed]`` placeholder instead of aborting the whole read.

    Args:
        file_path: Path to the PDF file

    Returns:
        Dictionary with three keys: ``text`` (the joined page text),
        ``pages`` (the number of pages in the reader) and ``metadata``
        (author, creator, producer, subject, title and creation_date, each
        falling back to "Unknown"; an empty dict when the reader reports no
        metadata).

    Raises:
        ImportError: If PyPDF2 cannot be imported.
        FileNotFoundError: If ``file_path`` does not exist.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    # Result field name -> key looked up in the PDF metadata. The creation
    # date is handled separately because it is additionally passed to str().
    plain_fields = (
        ("author", "/Author"),
        ("creator", "/Creator"),
        ("producer", "/Producer"),
        ("subject", "/Subject"),
        ("title", "/Title"),
    )

    try:
        from PyPDF2 import PdfReader

        # Reject missing paths up front with an explicit message
        source = Path(file_path)
        if not source.exists():
            raise FileNotFoundError(f"PDF file not found: {file_path}")

        document = PdfReader(file_path)

        # Collect one labelled section per page
        sections = []
        for number, page in enumerate(document.pages, start=1):
            header = f"--- Page {number} ---"
            try:
                content = page.extract_text()
            except Exception as exc:
                # One unreadable page must not abort the whole document
                logger.warning(f"Failed to extract text from page {number}: {exc}")
                sections.append(f"{header}\n[Extraction failed]")
                continue
            if content:
                sections.append(f"{header}\n{content}")

        combined = "\n\n".join(sections)

        # Gather document metadata, if the reader exposes any
        details = {}
        document_info = document.metadata
        if document_info:
            details = {
                field: document_info.get(pdf_key, "Unknown")
                for field, pdf_key in plain_fields
            }
            details["creation_date"] = str(
                document_info.get("/CreationDate", "Unknown")
            )

        return {
            "text": combined,
            "pages": len(document.pages),
            "metadata": details,
        }
    except ImportError:
        logger.error("PyPDF2 not installed. Install with: pip install pypdf2")
        raise
    except Exception as exc:
        logger.error(f"Error reading PDF: {exc}")
        raise
def get_pdf_info(file_path: str) -> Dict[str, Any]:
    """
    Get basic information about a PDF without extracting all text.

    The path is validated before PyPDF2 is imported, so a missing file is
    always reported as ``FileNotFoundError`` with a clear message, whether
    or not the optional dependency is installed.

    Args:
        file_path: Path to the PDF file

    Returns:
        Dictionary with the keys ``page_count``, ``is_encrypted``,
        ``file_size_bytes`` and ``file_name``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ImportError: If PyPDF2 cannot be imported.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        pdf_path = Path(file_path)

        # Cheap validation first: gives the same explicit error message as
        # read_pdf instead of a raw open() failure from inside the reader.
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {file_path}")

        from PyPDF2 import PdfReader

        reader = PdfReader(file_path)
        # NOTE(review): counting pages of an encrypted, undecrypted file may
        # raise depending on the PyPDF2 version - confirm against the pinned
        # release before relying on "is_encrypted" being reported as True.
        return {
            "page_count": len(reader.pages),
            "is_encrypted": reader.is_encrypted,
            "file_size_bytes": pdf_path.stat().st_size,
            "file_name": pdf_path.name,
        }
    except ImportError:
        # Same install hint as read_pdf, so both tools fail identically
        logger.error("PyPDF2 not installed. Install with: pip install pypdf2")
        raise
    except Exception as e:
        logger.error("Error getting PDF info: %s", e)
        raise