# Spaces: PeterBot22 / dealflow-ai (Running) - File size: 3,176 Bytes - 8dcf472

"""
PDF Extractor Tool — extracts text and structured data from pitch deck PDFs.
Takes a local file path (for example, the temp file a Gradio upload is saved to).
NOTE: the tool defined below accepts only a path string, not raw bytes.
"""
from __future__ import annotations

import io
import json
from pathlib import Path
from typing import Union

from crewai.tools import BaseTool
from loguru import logger
from pydantic import Field


class PDFExtractorTool(BaseTool):
    """Tool that reads a PDF from disk and returns its content as a JSON string.

    The tool only opens files located under one of ``_ALLOWED_ROOTS``. Every
    outcome, including failures, is reported as a JSON document; failures are
    a JSON object with a single ``"error"`` key.
    """

    name: str = "pdf_extractor"
    description: str = (
        "Extract text content and structure from a PDF pitch deck. "
        "Input: absolute file path to the PDF. "
        "Output: JSON with pages, full_text, slide_titles, and key_figures."
    )

    # Allow-list of directories where PDFs may be read from.
    # Defaults to system temp dir (Gradio uploads land there).
    _ALLOWED_ROOTS: tuple[str, ...] = (
        "/tmp",
        "/var/folders",  # macOS temp
    )

    def _is_allowed_path(self, path: Path) -> bool:
        """Return True if *path* is one of the allowed roots or lies beneath one.

        Both the candidate and each root are resolved before comparing, so a
        root that is itself a symlink (e.g. ``/tmp`` on macOS) still matches
        the resolved candidate. Containment is decided on path components, not
        on string prefixes, so ``/tmpfoo/x.pdf`` is NOT treated as being
        inside ``/tmp``.
        """
        resolved = path.resolve()
        for root in self._ALLOWED_ROOTS:
            root_path = Path(root).resolve()
            if resolved == root_path or root_path in resolved.parents:
                return True
        return False

    def _run(self, file_path: str) -> str:
        """Extract content from a PDF file.

        Args:
            file_path: Path to the PDF. It is resolved (symlinks and ``..``
                collapsed) before any check is made.

        Returns:
            The JSON produced by ``_extract_with_pdfplumber`` on success, or a
            JSON object ``{"error": "..."}`` when the path is rejected or
            extraction raises.
        """
        try:
            path = Path(file_path).resolve()
            # Check the allow-list first so the error message cannot be used
            # to probe whether files exist outside the permitted directories.
            if not self._is_allowed_path(path):
                return json.dumps({"error": "File path is outside the permitted upload directory."})
            # is_file() (rather than exists()) also rejects directories.
            if not path.is_file():
                return json.dumps({"error": "File not found."})
            if path.suffix.lower() != ".pdf":
                return json.dumps({"error": "Expected a .pdf file."})

            return self._extract_with_pdfplumber(path)
        except Exception as exc:
            # Tool boundary: never raise to the caller; log with traceback and
            # report the failure as JSON instead.
            logger.exception(f"PDFExtractorTool error: {exc}")
            return json.dumps({"error": str(exc)})

    def _extract_with_pdfplumber(self, path: Path) -> str:
        """Read every page of the PDF at *path* and serialise the result.

        Returns:
            A JSON string with the keys ``file`` (base name), ``total_pages``,
            ``slide_titles``, ``full_text`` (page texts joined by blank lines)
            and ``pages`` (per-page ``page``, ``title``, ``text``, ``tables``).
        """
        # Imported lazily; an ImportError here is caught by _run's handler.
        import pdfplumber

        pages = []
        slide_titles: list[str] = []
        full_text_parts: list[str] = []

        with pdfplumber.open(str(path)) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                text = page.extract_text() or ""
                tables = page.extract_tables() or []

                # Heuristic: first non-empty line of each page is likely the slide title
                lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
                title = lines[0] if lines else f"Page {page_num}"
                slide_titles.append(title)
                full_text_parts.append(text)

                pages.append(
                    {
                        "page": page_num,
                        "title": title,
                        "text": text,
                        # Missing cells (None) become ""; every other value is
                        # stringified so the structure is JSON-serialisable.
                        "tables": [
                            [["" if cell is None else str(cell) for cell in row] for row in table]
                            for table in tables
                        ],
                    }
                )

        result = {
            "file": str(path.name),
            "total_pages": len(pages),
            "slide_titles": slide_titles,
            "full_text": "\n\n".join(full_text_parts),
            "pages": pages,
        }
        return json.dumps(result, ensure_ascii=False, indent=2)