Saleh
Clean deployment to HuggingFace Space
2447eba
"""
PDF Parser Tool — Extracts text from uploaded PDF files.
Assigned To: Paper Extractor agent ONLY
Reference: system_design.md — Tool 1 (Lines 409-436)
Reference: engineering_guardrails.md — §2 Tool-Call Argument Validation (Lines 29-61)
Key guardrails:
- Input validation: file extension, file exists, file size <20MB
- Returns error STRINGS, never raises exceptions
- Text capped at 50,000 chars to prevent token overflow
"""
import os
import pdfplumber
from crewai.tools import tool
@tool
def pdf_parser_tool(file_path: str) -> str:
    """Extract text content from a PDF file. Validates file type and size before extraction."""
    # NOTE(review): the docstring above is intentionally left unchanged; the
    # @tool decorator presumably exposes it as the tool description shown to
    # the agent -- confirm before rewording it.
    #
    # Contract:
    #   * file_path: path to a PDF on the local filesystem.
    #   * Returns the extracted text (pages joined with "\n"), uncapped, or a
    #     string starting with "ERROR: " describing why extraction failed.
    #   * Never raises: every failure is reported as an error string so the
    #     calling agent can read and react to it.
    max_size_mb = 20
    # Below this many non-whitespace-trimmed characters the PDF is treated as
    # having no usable text layer (e.g. a scanned, image-only document).
    min_text_chars = 100

    # === INPUT VALIDATION ===
    if not file_path:
        return "ERROR: No file path provided."
    if not isinstance(file_path, str):
        # Guard so a non-string argument yields an error string, not an
        # AttributeError from the string methods below.
        return f"ERROR: File path must be a string. Got: {type(file_path).__name__}"
    # Case-insensitive so that e.g. "REPORT.PDF" is accepted.
    if not file_path.lower().endswith(".pdf"):
        # splitext only looks at the final path component, so dots in
        # directory names do not produce a bogus "extension".
        extension = os.path.splitext(file_path)[1].lstrip(".")
        return "ERROR: File must be a .pdf file. Got: " + (extension or "(no extension)")
    if not os.path.exists(file_path):
        return "ERROR: File not found at path: " + file_path
    if not os.path.isfile(file_path):
        # A directory whose name ends in ".pdf" would otherwise reach the parser.
        return "ERROR: Path is not a regular file: " + file_path
    try:
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
    except OSError as e:
        # The file can disappear or become unreadable after the checks above.
        return f"ERROR: Could not read file size — {type(e).__name__}: {e}"
    if file_size_mb > max_size_mb:
        return f"ERROR: File too large ({file_size_mb:.1f}MB). Maximum is {max_size_mb}MB."

    # === EXECUTION (only if validation passes) ===
    try:
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may yield None for a page; treat that as empty.
            text = "\n".join(page.extract_text() or "" for page in pdf.pages)
    except Exception as e:
        # Deliberately broad: this is the tool boundary and must not raise.
        return f"ERROR: Failed to parse PDF — {type(e).__name__}: {e}"

    if len(text.strip()) < min_text_chars:
        return "ERROR: PDF contains insufficient extractable text (possibly scanned/image-only)."
    return text  # Full text — GPT-4o handles 128k tokens