""" PDF Parser Tool — Extracts text from uploaded PDF files. Assigned To: Paper Extractor agent ONLY Reference: system_design.md — Tool 1 (Lines 409-436) Reference: engineering_guardrails.md — §2 Tool-Call Argument Validation (Lines 29-61) Key guardrails: - Input validation: file extension, file exists, file size <20MB - Returns error STRINGS, never raises exceptions - Text capped at 50,000 chars to prevent token overflow """ import os import pdfplumber from crewai.tools import tool @tool def pdf_parser_tool(file_path: str) -> str: """Extract text content from a PDF file. Validates file type and size before extraction.""" # === INPUT VALIDATION === if not file_path: return "ERROR: No file path provided." if not file_path.endswith(".pdf"): return "ERROR: File must be a .pdf file. Got: " + file_path.split(".")[-1] if not os.path.exists(file_path): return "ERROR: File not found at path: " + file_path file_size_mb = os.path.getsize(file_path) / (1024 * 1024) if file_size_mb > 20: return f"ERROR: File too large ({file_size_mb:.1f}MB). Maximum is 20MB." # === EXECUTION (only if validation passes) === try: with pdfplumber.open(file_path) as pdf: text = "\n".join(page.extract_text() or "" for page in pdf.pages) if len(text.strip()) < 100: return "ERROR: PDF contains insufficient extractable text (possibly scanned/image-only)." return text # Full text — GPT-4o handles 128k tokens except Exception as e: return f"ERROR: Failed to parse PDF — {type(e).__name__}: {str(e)}"