# Spaces: Yusufarsh / ReproAgent
# Runtime error
# File size: 2,975 Bytes
# 6325f00

"""

PDF text extraction utilities.

Supports multiple extraction methods with fallbacks.

"""

from pathlib import Path
from typing import Optional


def extract_text_pypdf(pdf_path: str) -> str:
    """
    Extract text using PyPDF2 (faster, less accurate).

    Every page of the document is read; the text of each page is followed
    by a newline. A page for which PyPDF2 yields no text (``None`` or an
    empty string) contributes only its newline instead of aborting the
    whole extraction.

    Args:
        pdf_path: Path to PDF file

    Returns:
        Extracted text, or a string starting with
        "Error extracting PDF with PyPDF2: " if anything goes wrong
        (including PyPDF2 not being installed). This function never raises.
    """
    try:
        # Imported lazily so the module can be loaded without PyPDF2.
        import PyPDF2

        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # `or ""` guards against extract_text() returning None, which
            # would otherwise raise TypeError on concatenation and turn a
            # mostly readable document into an error result.
            return "".join(
                f"{page.extract_text() or ''}\n" for page in reader.pages
            )
    except Exception as e:
        # Deliberate catch-all: callers rely on an error string, not an exception.
        return f"Error extracting PDF with PyPDF2: {str(e)}"


def extract_text_pdfplumber(pdf_path: str, max_pages: int = 10) -> str:
    """
    Extract text using pdfplumber (slower, more accurate).

    Only the first ``max_pages`` pages are read; the text of each page is
    followed by a newline. A page for which pdfplumber yields no text
    (``None`` or an empty string) contributes only its newline instead of
    aborting the whole extraction.

    Args:
        pdf_path: Path to PDF file
        max_pages: Maximum pages to extract (for speed)

    Returns:
        Extracted text, or a string starting with
        "Error extracting PDF with pdfplumber: " if anything goes wrong
        (including pdfplumber not being installed). This function never raises.
    """
    try:
        # Imported lazily so the module can be loaded without pdfplumber.
        import pdfplumber

        with pdfplumber.open(pdf_path) as pdf:
            # `or ""` guards against extract_text() returning None for a
            # page without a text layer, which would otherwise raise
            # TypeError on concatenation and fail the whole document.
            return "".join(
                f"{page.extract_text() or ''}\n"
                for page in pdf.pages[:max_pages]
            )
    except Exception as e:
        # Deliberate catch-all: callers rely on an error string, not an exception.
        return f"Error extracting PDF with pdfplumber: {str(e)}"


def extract_text(
    pdf_path: str,
    method: str = "auto",
    max_pages: int = 10
) -> str:
    """
    Extract text from PDF using specified method.

    Args:
        pdf_path: Path to PDF file
        method: 'pypdf', 'pdfplumber', or 'auto'. In 'auto' mode pdfplumber
            is tried first and PyPDF2 is used only if pdfplumber reports a
            failure.
        max_pages: Max pages to extract. Only forwarded to the pdfplumber
            extractor; the PyPDF2 extractor is called without a page limit.

    Returns:
        Extracted text on success. On failure a string starting with
        "Error" is returned instead of raising: the file is missing, the
        method name is unknown, or extraction failed.
    """
    if not Path(pdf_path).exists():
        return f"Error: File not found: {pdf_path}"

    # Both extractors signal failure with a message beginning with this exact
    # prefix. Matching the full prefix rather than just "Error" keeps a
    # document whose own text happens to start with the word "Error" from
    # being mistaken for a failed extraction and thrown away.
    failure_prefix = "Error extracting PDF with"

    if method == "auto":
        # Try pdfplumber first (more accurate)
        text = extract_text_pdfplumber(pdf_path, max_pages)
        if not text.startswith(failure_prefix):
            return text

        # Fallback to PyPDF2
        print("⚠️  pdfplumber failed, trying PyPDF2...")
        text = extract_text_pypdf(pdf_path)
        if not text.startswith(failure_prefix):
            return text

        return "Error: All PDF extraction methods failed"

    if method == "pypdf":
        return extract_text_pypdf(pdf_path)

    if method == "pdfplumber":
        return extract_text_pdfplumber(pdf_path, max_pages)

    return f"Error: Unknown method: {method}"


# Manual smoke test: extract the PDF named on the command line and show a preview.
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        # No path supplied: show how to invoke the script.
        print("Usage: python pdf_reader.py <pdf_path>")
    else:
        extracted = extract_text(cli_args[0])
        preview = extracted[:1000]  # First 1000 chars
        print(preview)
        print(f"\n... (Total length: {len(extracted)} characters)")