Spaces:
Runtime error
Runtime error
File size: 2,975 Bytes
"""
PDF text extraction utilities.

Supports multiple extraction methods with fallbacks.
"""
from pathlib import Path
from typing import Optional
def extract_text_pypdf(pdf_path: str) -> str:
    """
    Extract text using PyPDF2 (faster, less accurate).

    Every page of the document is read; the text of each page is followed
    by a newline.

    Args:
        pdf_path: Path to PDF file

    Returns:
        Extracted text, or a message starting with
        "Error extracting PDF with PyPDF2:" if anything goes wrong
        (including PyPDF2 not being importable). No exception is raised.
    """
    try:
        # Imported lazily so a missing PyPDF2 is reported through the error
        # string below instead of breaking import of this module.
        import PyPDF2

        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Defensive: if a page yields None instead of a string, treat it
            # as empty rather than failing the whole document with TypeError.
            # join() also avoids repeated string re-allocation on long PDFs.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
    except Exception as e:
        return f"Error extracting PDF with PyPDF2: {str(e)}"
def extract_text_pdfplumber(pdf_path: str, max_pages: int = 10) -> str:
    """
    Extract text using pdfplumber (slower, more accurate).

    Only the first ``max_pages`` pages are read; the text of each page is
    followed by a newline.

    Args:
        pdf_path: Path to PDF file
        max_pages: Maximum pages to extract (for speed)

    Returns:
        Extracted text, or a message starting with
        "Error extracting PDF with pdfplumber:" if anything goes wrong
        (including pdfplumber not being importable). No exception is raised.
    """
    try:
        # Imported lazily so a missing pdfplumber is reported through the
        # error string below instead of breaking import of this module.
        import pdfplumber

        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() can return None for a page with no text (e.g. a
            # scanned image page); count it as empty instead of letting
            # `None + "\n"` abort extraction of the remaining pages.
            return "".join(
                (page.extract_text() or "") + "\n"
                for page in pdf.pages[:max_pages]
            )
    except Exception as e:
        return f"Error extracting PDF with pdfplumber: {str(e)}"
def extract_text(
    pdf_path: str,
    method: str = "auto",
    max_pages: int = 10
) -> str:
    """
    Extract text from PDF using specified method.

    Args:
        pdf_path: Path to PDF file
        method: 'pypdf', 'pdfplumber', or 'auto' (try pdfplumber, then
            fall back to PyPDF2)
        max_pages: Max pages to extract. Passed to the pdfplumber extractor
            only; the PyPDF2 extractor is called without a page limit.

    Returns:
        Extracted text, or a message starting with "Error" when the file
        does not exist, the method is unknown, or extraction fails.
        No exception is raised for these cases.
    """
    if not Path(pdf_path).exists():
        return f"Error: File not found: {pdf_path}"

    if method == "auto":
        # Failure is detected by the exact prefix each extractor puts on its
        # error message. Checking only for "Error" would misclassify a PDF
        # whose real text happens to begin with that word.
        # Try pdfplumber first (more accurate)
        text = extract_text_pdfplumber(pdf_path, max_pages)
        if not text.startswith("Error extracting PDF with pdfplumber:"):
            return text

        # Fallback to PyPDF2
        print("⚠️ pdfplumber failed, trying PyPDF2...")
        text = extract_text_pypdf(pdf_path)
        if not text.startswith("Error extracting PDF with PyPDF2:"):
            return text

        return "Error: All PDF extraction methods failed"
    elif method == "pypdf":
        return extract_text_pypdf(pdf_path)
    elif method == "pdfplumber":
        return extract_text_pdfplumber(pdf_path, max_pages)
    else:
        return f"Error: Unknown method: {method}"
# Manual smoke test: run this file directly with a PDF path to preview
# the extracted text.
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python pdf_reader.py <pdf_path>")
    else:
        extracted = extract_text(cli_args[0])
        # Show only the beginning, then report the full size.
        print(extracted[:1000])  # First 1000 chars
        print(f"\n... (Total length: {len(extracted)} characters)")
|