Spaces:
Running
Running
| """ | |
| PDF Extractor Tool — extracts text and structured data from pitch deck PDFs. | |
| Accepts a local file path, such as the temp file created by a Gradio upload. | |
| """ | |
| from __future__ import annotations | |
| import io | |
| import json | |
| from pathlib import Path | |
| from typing import Union | |
| from crewai.tools import BaseTool | |
| from loguru import logger | |
| from pydantic import Field | |
class PDFExtractorTool(BaseTool):
    """Tool that extracts text and tables from a PDF pitch deck.

    Only ``.pdf`` files located under one of ``_ALLOWED_ROOTS`` are read.
    Every outcome is returned as a JSON string; failures take the form
    ``{"error": "<message>"}`` instead of raising.
    """

    name: str = "pdf_extractor"
    description: str = (
        "Extract text content and structure from a PDF pitch deck. "
        "Input: absolute file path to the PDF. "
        "Output: JSON with file, total_pages, slide_titles, full_text, and pages."
    )

    # Allow-list of directories where PDFs may be read from
    # (temp locations; Gradio uploads land there).
    _ALLOWED_ROOTS: tuple[str, ...] = (
        "/tmp",
        "/var/folders",  # macOS temp
    )

    def _is_allowed_path(self, path: Path) -> bool:
        """Return True if *path* lies inside one of the allow-listed roots.

        Both the candidate and each root are resolved, so a root that is a
        symlink (e.g. ``/tmp`` -> ``/private/tmp`` on macOS) still matches.
        Containment is checked per path component rather than by string
        prefix, so a sibling such as ``/tmpfoo`` is not treated as a child
        of ``/tmp``.
        """
        resolved = path.resolve()
        for root in self._ALLOWED_ROOTS:
            try:
                # relative_to raises ValueError when `resolved` is not
                # located under the root.
                resolved.relative_to(Path(root).resolve())
            except ValueError:
                continue
            return True
        return False

    def _run(self, file_path: str) -> str:
        """Extract content from a PDF file.

        Args:
            file_path: Path to the PDF; it is resolved before validation.

        Returns:
            A JSON string: the extraction result on success, otherwise
            ``{"error": "<message>"}``. Exceptions are logged and reported
            in the JSON rather than propagated.
        """
        try:
            path = Path(file_path).resolve()
            # Enforce the allow-list first so the tool cannot be used to
            # probe whether arbitrary paths on the host exist.
            if not self._is_allowed_path(path):
                return json.dumps({"error": "File path is outside the permitted upload directory."})
            # is_file() also rejects directories that merely exist.
            if not path.is_file():
                return json.dumps({"error": "File not found."})
            if path.suffix.lower() != ".pdf":
                return json.dumps({"error": "Expected a .pdf file."})
            return self._extract_with_pdfplumber(path)
        except Exception as exc:
            # Tool boundary: never raise; log with traceback and report the
            # failure in the JSON payload.
            logger.exception(f"PDFExtractorTool error: {exc}")
            return json.dumps({"error": str(exc)})

    def _extract_with_pdfplumber(self, path: Path) -> str:
        """Read *path* with pdfplumber and return the extraction as JSON.

        The JSON object has the keys ``file`` (file name only),
        ``total_pages``, ``slide_titles``, ``full_text`` (page texts joined
        by blank lines) and ``pages`` (one entry per page with ``page``,
        ``title``, ``text`` and ``tables``).
        """
        import pdfplumber  # imported lazily, only when extraction runs

        pages: list[dict] = []
        slide_titles: list[str] = []
        full_text_parts: list[str] = []

        with pdfplumber.open(str(path)) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                text = page.extract_text() or ""
                tables = page.extract_tables() or []

                # Heuristic: first non-empty line of each page is likely the slide title
                lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
                title = lines[0] if lines else f"Page {page_num}"

                slide_titles.append(title)
                full_text_parts.append(text)
                pages.append(
                    {
                        "page": page_num,
                        "title": title,
                        "text": text,
                        # Missing cells become empty strings so every cell
                        # serializes as text.
                        "tables": [
                            [
                                ["" if cell is None else str(cell) for cell in row]
                                for row in table
                            ]
                            for table in tables
                        ],
                    }
                )

        result = {
            "file": path.name,
            "total_pages": len(pages),
            "slide_titles": slide_titles,
            "full_text": "\n\n".join(full_text_parts),
            "pages": pages,
        }
        return json.dumps(result, ensure_ascii=False, indent=2)