File size: 4,756 Bytes
3998131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""
PDF text extraction module
Handles extraction from legal PDF documents
"""

import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

try:
    import pdfplumber
    PDFPLUMBER_AVAILABLE = True
except ImportError:
    PDFPLUMBER_AVAILABLE = False

try:
    from PyPDF2 import PdfReader
    PYPDF2_AVAILABLE = True
except ImportError:
    PYPDF2_AVAILABLE = False

from .config import PDF_EXTRACTION_METHOD, PDF_FALLBACK_METHOD

logger = logging.getLogger(__name__)


class PDFExtractor:
    """Extract per-page text from PDF files using pdfplumber or PyPDF2.

    The preferred backend is selected at construction time. If that backend
    is not installed, or if it raises while processing a file, the other
    backend is tried when it is available.
    """

    # Backend names understood by this class.
    _KNOWN_METHODS = ("pdfplumber", "pypdf2")

    def __init__(self, method: str = PDF_EXTRACTION_METHOD):
        """
        Initialize PDF extractor

        Args:
            method: Extraction method ('pdfplumber' or 'pypdf2'). Matching is
                case-insensitive; an unrecognised value is treated as
                'pypdf2' after logging a warning.

        Raises:
            ImportError: If neither pdfplumber nor PyPDF2 is installed.
        """
        self.method = method
        self._validate_dependencies()

    def _validate_dependencies(self) -> None:
        """Normalise ``self.method`` and make sure a usable backend exists.

        ``self.method`` is rewritten to the backend that will actually be
        used, which may differ from the requested one when the requested
        library is not installed.

        Raises:
            ImportError: If neither pdfplumber nor PyPDF2 is installed.
        """
        method = self.method
        if isinstance(method, str):
            method = method.strip().lower()

        if method not in self._KNOWN_METHODS:
            # extract_from_file() routes anything that is not 'pdfplumber'
            # to PyPDF2, so make that choice explicit and visible in the log.
            logger.warning(f"Unknown extraction method {self.method!r}, using PyPDF2")
            method = "pypdf2"

        if method == "pdfplumber" and not PDFPLUMBER_AVAILABLE:
            logger.warning("pdfplumber not available, falling back to PyPDF2")
            method = "pypdf2"

        if method == "pypdf2" and not PYPDF2_AVAILABLE:
            if not PDFPLUMBER_AVAILABLE:
                raise ImportError("No PDF extraction library available. Install pdfplumber or PyPDF2")
            # Mirror the pdfplumber -> PyPDF2 fallback above instead of
            # failing while a working backend is installed.
            logger.warning("PyPDF2 not available, falling back to pdfplumber")
            method = "pdfplumber"

        self.method = method

    def extract_from_file(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """
        Extract text from PDF file

        Args:
            pdf_path: Path to PDF file (a plain string path is accepted too)

        Returns:
            List of dicts with 'page_number' and 'text' keys. Pages that
            yield no text are omitted, so the list may be shorter than the
            document and page numbers may have gaps.

        Raises:
            Exception: Whatever the last attempted backend raised, when the
                primary backend fails and no fallback succeeds.
        """
        pdf_path = Path(pdf_path)
        logger.info(f"Extracting text from {pdf_path.name} using {self.method}")

        use_pdfplumber = self.method == "pdfplumber"
        primary = self._extract_with_pdfplumber if use_pdfplumber else self._extract_with_pypdf2

        try:
            return primary(pdf_path)
        except Exception as e:
            logger.error(f"Extraction failed with {self.method}: {e}")
            # Try fallback method
            if use_pdfplumber and PYPDF2_AVAILABLE:
                logger.info("Trying fallback method: PyPDF2")
                return self._extract_with_pypdf2(pdf_path)
            if self.method == "pypdf2" and PDFPLUMBER_AVAILABLE:
                logger.info("Trying fallback method: pdfplumber")
                return self._extract_with_pdfplumber(pdf_path)
            raise

    @staticmethod
    def _collect_pages(pages, source_name: str) -> List[Dict[str, Any]]:
        """Call ``extract_text()`` on each page and keep the non-empty results.

        Args:
            pages: Iterable of page objects exposing ``extract_text()``.
            source_name: File name used in the summary log line.

        Returns:
            List of dicts with 'page_number' (1-based) and 'text' keys.
        """
        pages_data = []
        for page_num, page in enumerate(pages, start=1):
            text = page.extract_text()
            if text:
                pages_data.append({
                    'page_number': page_num,
                    'text': text,
                })
            else:
                # Covers both None and empty-string results.
                logger.warning(f"No text extracted from page {page_num}")

        logger.info(f"Extracted {len(pages_data)} pages from {source_name}")
        return pages_data

    def _extract_with_pdfplumber(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """Extract using pdfplumber (better for complex layouts)"""
        with pdfplumber.open(pdf_path) as pdf:
            return self._collect_pages(pdf.pages, pdf_path.name)

    def _extract_with_pypdf2(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """Extract using PyPDF2 (fallback method)"""
        with open(pdf_path, 'rb') as file:
            # Pages are read while the file handle is still open.
            return self._collect_pages(PdfReader(file).pages, pdf_path.name)

    def extract_from_directory(self, directory: Path) -> Dict[str, List[Dict[str, Any]]]:
        """
        Extract text from all PDFs in a directory

        Only entries directly inside ``directory`` that match ``*.pdf`` are
        processed; subdirectories are not searched. Files are handled in
        sorted order so the result is deterministic.

        Args:
            directory: Path to directory containing PDFs (a plain string
                path is accepted too)

        Returns:
            Dict mapping filename to list of page data. A file whose
            extraction fails is mapped to an empty list and the error is
            logged.
        """
        directory = Path(directory)
        results = {}
        pdf_files = sorted(directory.glob("*.pdf"))

        logger.info(f"Found {len(pdf_files)} PDF files in {directory}")

        for pdf_file in pdf_files:
            try:
                results[pdf_file.name] = self.extract_from_file(pdf_file)
            except Exception as e:
                # Best-effort batch: record the failure and keep going.
                logger.error(f"Failed to extract {pdf_file.name}: {e}")
                results[pdf_file.name] = []

        return results