| | """ |
| | PDF Parser Tool — Extracts text from uploaded PDF files. |
| | |
| | Assigned To: Paper Extractor agent ONLY |
| | Reference: system_design.md — Tool 1 (Lines 409-436) |
| | Reference: engineering_guardrails.md — §2 Tool-Call Argument Validation (Lines 29-61) |
| | |
| | Key guardrails: |
| | - Input validation: file extension, file exists, file size <20MB |
| | - Returns error STRINGS, never raises exceptions |
| | - Text capped at 50,000 chars to prevent token overflow |
| | """ |
| |
|
| | import os |
| | import pdfplumber |
| | from crewai.tools import tool |
| |
|
| |
|
@tool
def pdf_parser_tool(file_path: str) -> str:
    """Extract text content from a PDF file. Validates file type and size before extraction.

    Args:
        file_path: Path to a ``.pdf`` file on disk (extension is matched
            case-insensitively).

    Returns:
        The extracted text (pages joined by newlines, truncated to 50,000
        characters), or a message starting with ``"ERROR:"`` describing why
        extraction was not possible. This function never raises.
    """
    max_size_mb = 20        # reject files larger than this
    max_chars = 50_000      # cap on returned text to prevent token overflow
    min_chars = 100         # below this the PDF is treated as scanned/image-only

    # --- Input validation -------------------------------------------------
    # A non-string argument is treated like a missing path so that the
    # "error strings, never exceptions" contract also holds for bad types.
    if not isinstance(file_path, str) or not file_path:
        return "ERROR: No file path provided."

    if not file_path.lower().endswith(".pdf"):
        # Report only the real extension of the final path component; a bare
        # split(".") would echo directory fragments or the whole path.
        extension = os.path.splitext(file_path)[1].lstrip(".") or "(no extension)"
        return "ERROR: File must be a .pdf file. Got: " + extension

    if not os.path.exists(file_path):
        return "ERROR: File not found at path: " + file_path

    # The file can vanish or be unreadable between the existence check and
    # this call, so the size lookup is guarded as well.
    try:
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
    except OSError as e:
        return f"ERROR: Unable to access file at path: {file_path} ({type(e).__name__})"

    if file_size_mb > max_size_mb:
        return f"ERROR: File too large ({file_size_mb:.1f}MB). Maximum is 20MB."

    # --- Extraction -------------------------------------------------------
    # Broad catch is deliberate: this is the tool boundary, and any parser
    # failure must reach the agent as an error string rather than an exception.
    try:
        page_texts = []
        collected = 0
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                # extract_text() may return None for pages without a text layer.
                page_text = page.extract_text() or ""
                page_texts.append(page_text)
                collected += len(page_text) + 1  # +1 for the joining newline
                if collected > max_chars:
                    # Enough text to fill the cap; skip the remaining pages.
                    break

        text = "\n".join(page_texts)

        if len(text.strip()) < min_chars:
            return "ERROR: PDF contains insufficient extractable text (possibly scanned/image-only)."

        return text[:max_chars]

    except Exception as e:
        return f"ERROR: Failed to parse PDF — {type(e).__name__}: {e}"
| |
|