"""
PDF Extractor Tool — extracts text and structured data from pitch deck PDFs.
Supports both local file paths and bytes from Gradio uploads.
"""

from __future__ import annotations

import io
import json
from pathlib import Path
from typing import Union

from crewai.tools import BaseTool
from loguru import logger
from pydantic import Field


class PDFExtractorTool(BaseTool):
    """Tool that reads a PDF from an allow-listed directory and returns JSON.

    Every outcome is reported as a JSON string: on success an object with
    ``file``, ``total_pages``, ``slide_titles``, ``full_text`` and ``pages``;
    on failure an object with a single ``error`` key.
    """

    name: str = "pdf_extractor"
    # The output keys listed here mirror the dict built in
    # ``_extract_with_pdfplumber``; keep the two in sync.
    description: str = (
        "Extract text content and structure from a PDF pitch deck. "
        "Input: absolute file path to the PDF. "
        "Output: JSON with file, total_pages, slide_titles, full_text, and pages."
    )

    # Allow-list of directories where PDFs may be read from.
    # Defaults to system temp dir (Gradio uploads land there).
    _ALLOWED_ROOTS: tuple[str, ...] = (
        "/tmp",
        "/var/folders",  # macOS temp
    )

    def _is_allowed_path(self, path: Path) -> bool:
        """Return True when *path* lies inside one of the allow-listed roots.

        Both the candidate and every root are symlink-resolved before the
        comparison, so a root that is itself a symlink (e.g. on macOS, where
        ``/tmp`` and ``/var`` point into ``/private``) still matches. The
        comparison is done per path component rather than on raw strings, so
        a sibling directory such as ``/tmpfoo`` is not mistaken for ``/tmp``.
        """
        candidate = path.resolve()
        return any(
            candidate.is_relative_to(Path(root).resolve())
            for root in self._ALLOWED_ROOTS
        )

    def _run(self, file_path: str) -> str:
        """Extract content from a PDF file.

        Args:
            file_path: Path to the PDF; it must resolve to a location inside
                one of the allow-listed roots.

        Returns:
            A JSON string: the extraction result on success, otherwise an
            object with an ``error`` message. Exceptions are logged and
            converted to an ``error`` object rather than propagated.
        """
        try:
            path = Path(file_path).resolve()
            # Check the allow-list before touching the filesystem so the
            # error message cannot be used to probe whether arbitrary files
            # outside the upload directories exist.
            if not self._is_allowed_path(path):
                return json.dumps(
                    {"error": "File path is outside the permitted upload directory."}
                )
            if not path.exists():
                return json.dumps({"error": "File not found."})
            if path.suffix.lower() != ".pdf":
                return json.dumps({"error": "Expected a .pdf file."})
            return self._extract_with_pdfplumber(path)
        except Exception as exc:
            logger.error(f"PDFExtractorTool error: {exc}")
            return json.dumps({"error": str(exc)})

    def _extract_with_pdfplumber(self, path: Path) -> str:
        """Read every page of *path* with pdfplumber and serialize the result.

        Args:
            path: Location of the PDF to open.

        Returns:
            A JSON string with the file name, page count, per-page titles,
            the concatenated text, and a per-page list of text and tables.
        """
        # Imported lazily so the module can be loaded without pdfplumber.
        import pdfplumber

        pages = []
        slide_titles: list[str] = []
        full_text_parts: list[str] = []

        with pdfplumber.open(str(path)) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                text = page.extract_text() or ""
                tables = page.extract_tables() or []

                # Heuristic: first non-empty line of each page is likely the slide title
                lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
                title = lines[0] if lines else f"Page {page_num}"

                slide_titles.append(title)
                full_text_parts.append(text)
                pages.append(
                    {
                        "page": page_num,
                        "title": title,
                        "text": text,
                        # Normalize cells to strings; falsy cells become "".
                        "tables": [
                            [[str(cell) if cell else "" for cell in row] for row in table]
                            for table in tables
                        ],
                    }
                )

        result = {
            "file": str(path.name),
            "total_pages": len(pages),
            "slide_titles": slide_titles,
            "full_text": "\n\n".join(full_text_parts),
            "pages": pages,
        }
        return json.dumps(result, ensure_ascii=False, indent=2)