Spaces:

PeterBot22
/

dealflow-ai

Running

App Files Files Community

dealflow-ai / src /tools /pdf_extractor.py

PeterBot22

feat: DealFlow AI MVP — 3-agent CrewAI due diligence system on HF Spaces

8dcf472 verified 18 days ago

raw

history blame contribute delete

3.18 kB

	"""
	PDF Extractor Tool — extracts text and structured data from pitch deck PDFs.
	Reads PDFs from a local file path (for example, the temp-file path of a Gradio upload).
	"""
	from __future__ import annotations

	import io
	import json
	from pathlib import Path
	from typing import Union

	from crewai.tools import BaseTool
	from loguru import logger
	from pydantic import Field


	class PDFExtractorTool(BaseTool):
	    """CrewAI tool that dumps a pitch-deck PDF's text and tables as JSON.

	    The tool only reads files located under one of ``_ALLOWED_ROOTS``.
	    Every outcome, including failures, is reported to the caller as a JSON
	    string; failures use the shape ``{"error": "<message>"}``.
	    """

	    name: str = "pdf_extractor"
	    # The description is what the calling agent sees, so the listed output
	    # keys must match what _extract_with_pdfplumber actually emits.
	    description: str = (
	        "Extract text content and structure from a PDF pitch deck. "
	        "Input: absolute file path to the PDF. "
	        "Output: JSON with file, total_pages, slide_titles, full_text, and pages."
	    )

	    # Allow-list of directories where PDFs may be read from: the usual
	    # temp locations (Gradio uploads land there).
	    _ALLOWED_ROOTS: tuple[str, ...] = (
	        "/tmp",
	        "/var/folders",  # macOS temp
	    )

	    def _is_allowed_path(self, path: Path) -> bool:
	        """Return True if *path* is one of the allowed roots or lies inside one.

	        Both the candidate and each root are resolved before comparing, so a
	        root that is itself a symlink (e.g. ``/tmp`` -> ``/private/tmp`` on
	        macOS) still matches. The comparison is done on whole path
	        components rather than on a string prefix, so a sibling directory
	        such as ``/tmpfoo`` is NOT treated as being inside ``/tmp``.
	        """
	        resolved = path.resolve()
	        return any(
	            resolved.is_relative_to(Path(root).resolve())
	            for root in self._ALLOWED_ROOTS
	        )

	    def _run(self, file_path: str) -> str:
	        """Extract content from a PDF file.

	        Args:
	            file_path: Path to the PDF; it is resolved before validation.

	        Returns:
	            The JSON document produced by ``_extract_with_pdfplumber`` on
	            success, otherwise a JSON object with a single ``"error"`` key.
	            Any exception raised along the way is logged and returned in
	            that same error shape instead of propagating.
	        """
	        try:
	            path = Path(file_path).resolve()
	            # Check the allow-list before touching the filesystem so the
	            # tool cannot be used to probe whether arbitrary paths exist.
	            if not self._is_allowed_path(path):
	                return json.dumps({"error": "File path is outside the permitted upload directory."})
	            # is_file() also rejects directories, which exists() would accept.
	            if not path.is_file():
	                return json.dumps({"error": "File not found."})
	            if path.suffix.lower() != ".pdf":
	                return json.dumps({"error": "Expected a .pdf file."})

	            return self._extract_with_pdfplumber(path)
	        except Exception as exc:
	            # Tool boundary: report the failure to the agent as data, but
	            # keep the traceback in the log for debugging.
	            logger.exception("PDFExtractorTool error: {}", exc)
	            return json.dumps({"error": str(exc)})

	    def _extract_with_pdfplumber(self, path: Path) -> str:
	        """Read every page of *path* with pdfplumber and serialise the result.

	        Returns:
	            A JSON string with the keys ``file`` (file name only),
	            ``total_pages``, ``slide_titles``, ``full_text`` (page texts
	            joined by blank lines) and ``pages`` (one entry per page with
	            ``page``, ``title``, ``text`` and ``tables``).
	        """
	        # Imported here rather than at module level, so pdfplumber is only
	        # loaded when an extraction actually runs.
	        import pdfplumber

	        pages = []
	        slide_titles: list[str] = []
	        full_text_parts: list[str] = []

	        with pdfplumber.open(str(path)) as pdf:
	            for page_num, page in enumerate(pdf.pages, start=1):
	                text = page.extract_text() or ""
	                tables = page.extract_tables() or []

	                # Heuristic: first non-empty line of each page is likely the slide title
	                lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
	                title = lines[0] if lines else f"Page {page_num}"
	                slide_titles.append(title)
	                full_text_parts.append(text)

	                pages.append(
	                    {
	                        "page": page_num,
	                        "title": title,
	                        "text": text,
	                        # Falsy cells (e.g. None) become empty strings so
	                        # every cell in the JSON output is a string.
	                        "tables": [
	                            [[str(cell) if cell else "" for cell in row] for row in table]
	                            for table in tables
	                        ],
	                    }
	                )

	        result = {
	            "file": path.name,
	            "total_pages": len(pages),
	            "slide_titles": slide_titles,
	            "full_text": "\n\n".join(full_text_parts),
	            "pages": pages,
	        }
	        return json.dumps(result, ensure_ascii=False, indent=2)