Spaces:
Running
Running
File size: 3,176 Bytes
"""
PDF Extractor Tool — extracts text and structured data from pitch deck PDFs.
Reads the PDF from a local file path, such as the temporary file created by a Gradio upload.
"""
from __future__ import annotations
import io
import json
from pathlib import Path
from typing import Union
from crewai.tools import BaseTool
from loguru import logger
from pydantic import Field
class PDFExtractorTool(BaseTool):
    """Tool that reads a PDF from disk and returns its content as a JSON string.

    The tool only opens files that live under one of ``_ALLOWED_ROOTS``.
    Every failure is reported as a JSON object of the form
    ``{"error": "<message>"}`` instead of raising, so the caller always
    receives a string.
    """

    name: str = "pdf_extractor"
    # The advertised output keys mirror what _extract_with_pdfplumber really
    # returns, so the consumer of this description is not told to look for
    # fields that are never produced.
    description: str = (
        "Extract text content and structure from a PDF pitch deck. "
        "Input: absolute file path to the PDF. "
        "Output: JSON with file, total_pages, slide_titles, full_text, and pages."
    )

    # Allow-list of directories where PDFs may be read from.
    # Covers the usual temp locations (Gradio uploads are expected there).
    _ALLOWED_ROOTS: tuple[str, ...] = (
        "/tmp",
        "/var/folders",  # macOS temp
    )

    def _is_allowed_path(self, path: Path) -> bool:
        """Return True when *path* is one of the allowed roots or lies below one.

        Both the candidate and each root are resolved before comparing, so
        symlinked roots (e.g. ``/tmp`` -> ``/private/tmp`` on macOS) match.
        The comparison is done on path components rather than on a string
        prefix, so a sibling such as ``/tmpevil/x.pdf`` is NOT accepted.
        """
        resolved = path.resolve()
        for root in self._ALLOWED_ROOTS:
            root_path = Path(root).resolve()
            if resolved == root_path or root_path in resolved.parents:
                return True
        return False

    def _run(self, file_path: str) -> str:
        """Extract content from a PDF file.

        Args:
            file_path: Path to the PDF; it is resolved before validation.

        Returns:
            The JSON document produced by ``_extract_with_pdfplumber`` on
            success, otherwise a JSON object with a single ``"error"`` key.
        """
        try:
            path = Path(file_path).resolve()
            # Check the allow-list first so the tool cannot be used to probe
            # whether arbitrary files outside the upload directories exist.
            if not self._is_allowed_path(path):
                return json.dumps(
                    {"error": "File path is outside the permitted upload directory."}
                )
            # is_file() also rejects directories that happen to end in ".pdf".
            if not path.is_file():
                return json.dumps({"error": "File not found."})
            if path.suffix.lower() != ".pdf":
                return json.dumps({"error": "Expected a .pdf file."})
            return self._extract_with_pdfplumber(path)
        except Exception as exc:
            # Boundary handler: keep the traceback in the log, but hand the
            # caller a JSON error instead of propagating the exception.
            logger.exception(f"PDFExtractorTool error: {exc}")
            return json.dumps({"error": str(exc)})

    def _extract_with_pdfplumber(self, path: Path) -> str:
        """Read every page of *path* with pdfplumber and serialize the result.

        Returns:
            A JSON string with the keys ``file`` (file name only),
            ``total_pages``, ``slide_titles``, ``full_text`` (page texts
            joined by blank lines) and ``pages`` (one entry per page holding
            ``page``, ``title``, ``text`` and ``tables``).
        """
        # Imported lazily so the dependency is only loaded when a PDF is parsed.
        import pdfplumber

        pages = []
        slide_titles: list[str] = []
        full_text_parts: list[str] = []

        with pdfplumber.open(str(path)) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                text = page.extract_text() or ""
                tables = page.extract_tables() or []

                # Heuristic: first non-empty line of each page is likely the slide title
                lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
                title = lines[0] if lines else f"Page {page_num}"

                slide_titles.append(title)
                full_text_parts.append(text)
                pages.append(
                    {
                        "page": page_num,
                        "title": title,
                        "text": text,
                        # Missing cells (None) become empty strings so the
                        # table is always a grid of strings.
                        "tables": [
                            [
                                ["" if cell is None else str(cell) for cell in row]
                                for row in table
                            ]
                            for table in tables
                        ],
                    }
                )

        result = {
            "file": str(path.name),
            "total_pages": len(pages),
            "slide_titles": slide_titles,
            "full_text": "\n\n".join(full_text_parts),
            "pages": pages,
        }
        return json.dumps(result, ensure_ascii=False, indent=2)
|