Spaces:

AISA-Framework
/

AI-Research-Paper-Analyst

Sleeping

Saleh

Clean deployment to HuggingFace Space

2447eba 21 days ago

1.64 kB

	"""
	PDF Parser Tool — Extracts text from uploaded PDF files.

	Assigned To: Paper Extractor agent ONLY
	Reference: system_design.md — Tool 1 (Lines 409-436)
	Reference: engineering_guardrails.md — §2 Tool-Call Argument Validation (Lines 29-61)

	Key guardrails:
	- Input validation: file extension, file exists, file size <20MB
	- Returns error STRINGS, never raises exceptions
	- Extracted text is returned in full; no character cap is applied
	"""

	import os
	import pdfplumber
	from crewai.tools import tool


	@tool
	def pdf_parser_tool(file_path: str) -> str:
		"""Extract text content from a PDF file. Validates file type and size before extraction.

		Args:
			file_path: Path to a PDF file on local disk.

		Returns:
			The text of every page joined with newlines, or a string starting
			with "ERROR:" that explains why extraction was not possible.
			Failures are reported through the return value, not by raising.
		"""
		# NOTE(review): the crewai @tool decorator applied to this function
		# presumably surfaces the docstring to the agent as the tool
		# description — confirm before making it much longer.

		# === INPUT VALIDATION ===
		if not file_path:
			return "ERROR: No file path provided."

		# A truthy non-string (e.g. an int) would otherwise raise AttributeError
		# on the string methods below instead of producing an error string.
		if not isinstance(file_path, str):
			return "ERROR: File path must be a string. Got: " + type(file_path).__name__

		# Compare case-insensitively so "paper.PDF" is accepted as well.
		if not file_path.lower().endswith(".pdf"):
			return "ERROR: File must be a .pdf file. Got: " + file_path.split(".")[-1]

		if not os.path.exists(file_path):
			return "ERROR: File not found at path: " + file_path

		# The file can vanish or be unreadable between the existence check and
		# the stat call; report that as an error string rather than raising.
		try:
			file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
		except OSError as e:
			return f"ERROR: Could not read file size — {type(e).__name__}: {e}"

		if file_size_mb > 20:
			return f"ERROR: File too large ({file_size_mb:.1f}MB). Maximum is 20MB."

		# === EXECUTION (only if validation passes) ===
		try:
			with pdfplumber.open(file_path) as pdf:
				# extract_text() may yield None for a page; treat it as empty text.
				text = "\n".join(page.extract_text() or "" for page in pdf.pages)

			# Fewer than 100 non-whitespace-trimmed characters is treated as
			# "no usable text layer".
			if len(text.strip()) < 100:
				return "ERROR: PDF contains insufficient extractable text (possibly scanned/image-only)."

			return text  # Full text — GPT-4o handles 128k tokens

		except Exception as e:
			# Deliberately broad: any parser failure is converted to an error string.
			return f"ERROR: Failed to parse PDF — {type(e).__name__}: {e}"