File size: 2,975 Bytes
6325f00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""

PDF text extraction utilities.

Supports multiple extraction methods with fallbacks.

"""

from pathlib import Path
from typing import Optional


def extract_text_pypdf(pdf_path: str) -> str:
    """
    Extract text using PyPDF2 (faster, less accurate).

    Every page of the document is read; the text of each page is followed
    by a single newline in the result.

    Args:
        pdf_path: Path to PDF file

    Returns:
        Extracted text. On any failure (PyPDF2 not importable, unreadable
        file, parser error) no exception is raised; instead a string
        starting with "Error extracting PDF with PyPDF2: " is returned.
    """
    try:
        # Imported lazily so that a missing PyPDF2 install is reported
        # through the same error-string channel as any other failure.
        import PyPDF2

        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # `or ""` guards against a page yielding None (no extractable
            # text): one blank page must not fail the whole document.
            return "".join(
                f"{page.extract_text() or ''}\n" for page in reader.pages
            )
    except Exception as e:
        # Deliberately broad: callers detect failure via the "Error" prefix.
        return f"Error extracting PDF with PyPDF2: {str(e)}"


def extract_text_pdfplumber(pdf_path: str, max_pages: int = 10) -> str:
    """
    Extract text using pdfplumber (slower, more accurate).

    Only the first ``max_pages`` pages are read; the text of each page is
    followed by a single newline in the result.

    Args:
        pdf_path: Path to PDF file
        max_pages: Maximum pages to extract (for speed)

    Returns:
        Extracted text. On any failure (pdfplumber not importable,
        unreadable file, parser error) no exception is raised; instead a
        string starting with "Error extracting PDF with pdfplumber: " is
        returned.
    """
    try:
        # Imported lazily so that a missing pdfplumber install is reported
        # through the same error-string channel as any other failure.
        import pdfplumber

        with pdfplumber.open(pdf_path) as pdf:
            # `or ""` guards against a page yielding None (e.g. a blank or
            # image-only page): it must not discard the other pages' text.
            return "".join(
                f"{page.extract_text() or ''}\n"
                for page in pdf.pages[:max_pages]
            )
    except Exception as e:
        # Deliberately broad: callers detect failure via the "Error" prefix.
        return f"Error extracting PDF with pdfplumber: {str(e)}"


def extract_text(
    pdf_path: str,
    method: str = "auto",
    max_pages: int = 10
) -> str:
    """
    Extract text from a PDF, dispatching on the requested method.

    Args:
        pdf_path: Path to PDF file
        method: 'pypdf', 'pdfplumber', or 'auto' (pdfplumber first, then
            PyPDF2 if that result starts with "Error")
        max_pages: Max pages to extract (passed to pdfplumber only)

    Returns:
        Extracted text, or a string starting with "Error" when the file
        does not exist, the method is unknown, or extraction failed.
    """
    # The existence check comes before method validation.
    if not Path(pdf_path).exists():
        return f"Error: File not found: {pdf_path}"

    # Explicit single-backend requests are passed straight through.
    if method == "pypdf":
        return extract_text_pypdf(pdf_path)
    if method == "pdfplumber":
        return extract_text_pdfplumber(pdf_path, max_pages)
    if method != "auto":
        return f"Error: Unknown method: {method}"

    # "auto": prefer pdfplumber (more accurate) ...
    primary = extract_text_pdfplumber(pdf_path, max_pages)
    if not primary.startswith("Error"):
        return primary

    # ... and fall back to PyPDF2 only when pdfplumber reported an error.
    print("⚠️  pdfplumber failed, trying PyPDF2...")
    fallback = extract_text_pypdf(pdf_path)
    if fallback.startswith("Error"):
        return "Error: All PDF extraction methods failed"
    return fallback


# Manual smoke test: run this module directly with a PDF path argument
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python pdf_reader.py <pdf_path>")
    else:
        extracted = extract_text(cli_args[0])
        print(extracted[:1000])  # Preview: first 1000 chars only
        print(f"\n... (Total length: {len(extracted)} characters)")