File size: 1,644 Bytes
2447eba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""
PDF Parser Tool — Extracts text from uploaded PDF files.

Assigned To: Paper Extractor agent ONLY
Reference: system_design.md — Tool 1 (Lines 409-436)
Reference: engineering_guardrails.md — §2 Tool-Call Argument Validation (Lines 29-61)

Key guardrails:
  - Input validation: file extension, file exists, file size <20MB
  - Returns error STRINGS, never raises exceptions
  - Extracted text is returned in full; no 50,000-char cap is applied in this file (TODO: confirm against engineering_guardrails.md)
"""

import os
import pdfplumber
from crewai.tools import tool


@tool
def pdf_parser_tool(file_path: str) -> str:
    """Extract text content from a PDF file. Validates file type and size before extraction."""
    # NOTE(review): the docstring above is kept verbatim because the `tool`
    # decorator may surface it as the agent-facing description — confirm
    # before rewording it.
    #
    # Contract: every outcome is reported through the return value.
    #   - Success: the text of all pages, joined with "\n".
    #   - Failure: a string starting with "ERROR:" describing what went wrong.
    # No exception is meant to escape this function.

    # === INPUT VALIDATION ===
    if not file_path:
        return "ERROR: No file path provided."

    # A truthy non-string argument would raise AttributeError on the string
    # methods below instead of producing an error string.
    if not isinstance(file_path, str):
        return "ERROR: File path must be a string. Got: " + type(file_path).__name__

    # Compare the extension case-insensitively so "REPORT.PDF" is accepted.
    if not file_path.lower().endswith(".pdf"):
        return "ERROR: File must be a .pdf file. Got: " + file_path.split(".")[-1]

    if not os.path.exists(file_path):
        return "ERROR: File not found at path: " + file_path

    # A directory whose name ends in ".pdf" passes the existence check; reject
    # it here with a clear message rather than letting the parser fail on it.
    if not os.path.isfile(file_path):
        return "ERROR: Path is not a file: " + file_path

    # getsize can raise OSError (e.g. the file vanished or is inaccessible);
    # convert that into an error string like every other failure.
    try:
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
    except OSError as e:
        return f"ERROR: Could not read file size — {type(e).__name__}: {str(e)}"

    if file_size_mb > 20:
        return f"ERROR: File too large ({file_size_mb:.1f}MB). Maximum is 20MB."

    # === EXECUTION (only if validation passes) ===
    try:
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may yield None for a page; substitute "" so the
            # join never sees a non-string.
            text = "\n".join(page.extract_text() or "" for page in pdf.pages)

        # Fewer than 100 non-whitespace-trimmed characters is treated as
        # "nothing usable was extracted".
        if len(text.strip()) < 100:
            return "ERROR: PDF contains insufficient extractable text (possibly scanned/image-only)."

        return text  # Full text — GPT-4o handles 128k tokens

    except Exception as e:
        # Broad catch is deliberate: any parser failure becomes an error string.
        return f"ERROR: Failed to parse PDF — {type(e).__name__}: {str(e)}"