File size: 3,356 Bytes
c0f31c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import pytesseract
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document
from pdf2image import convert_from_path

class OCREnhancedPDFLoader:
    """Load a PDF page by page, falling back to OCR for pages with little text.

    Each page is first read with ``PyMuPDFLoader``. When the extracted text is
    too short to be trusted, the page image (rendered with ``pdf2image``) is
    run through Tesseract instead. Pages that end up with no usable text are
    skipped and their 1-based page numbers are recorded in ``skipped_pages``.
    """

    # A page whose cleaned text is shorter than this many characters is blank.
    BLANK_THRESHOLD = 10
    # Extracted text longer than BLANK_THRESHOLD * this factor is used as-is,
    # without running OCR.
    TEXT_SUFFICIENT_FACTOR = 5
    # Resolution used when rendering pages to images for OCR.
    OCR_DPI = 300

    def __init__(self, file_path: str, tesseract_path: str = None):
        """Validate the inputs and remember the PDF path.

        Args:
            file_path: Path to an existing PDF file.
            tesseract_path: Optional path to the Tesseract executable. When
                omitted (or empty), pytesseract's own command setting is left
                untouched, so no platform-specific default path is baked in.

        Raises:
            FileNotFoundError: If ``file_path`` is not an existing file.
            ValueError: If ``tesseract_path`` is given but is not an existing
                file.
        """
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"PDF file not found at path: {file_path}")

        self.file_path = file_path
        # 1-based numbers of the pages dropped by the most recent load().
        self.skipped_pages = []

        # Only override the command when a specific path is provided.
        if tesseract_path:
            if not os.path.isfile(tesseract_path):
                raise ValueError(f"Tesseract executable not found at path: {tesseract_path}")
            # NOTE: this assigns a module-level attribute of pytesseract, so it
            # is shared by every user of pytesseract in the process.
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

    def _is_blank_page(self, text: str) -> bool:
        """Return True when ``text`` holds fewer than BLANK_THRESHOLD characters.

        Surrounding whitespace and all newline, carriage-return and tab
        characters are ignored; interior spaces still count towards the length.
        ``None`` and empty strings are blank.
        """
        if not text or not text.strip():
            return True
        cleaned_text = text.strip().replace('\n', '').replace('\r', '').replace('\t', '')
        return len(cleaned_text) < self.BLANK_THRESHOLD

    def _process_page(self, doc, img, page_number: int) -> "Document | None":
        """Build the final document for one page, using OCR when needed.

        Args:
            doc: Page document from the text loader; its ``page_content`` and
                ``metadata`` attributes are read.
            img: Rendered image of the same page, or ``None`` when no image is
                available (OCR is then not attempted).
            page_number: 1-based page number stored in the metadata.

        Returns:
            A new ``Document``, or ``None`` when the page is blank (in which
            case ``page_number`` is appended to ``skipped_pages``).
        """
        existing_text = doc.page_content or ""
        page_text = existing_text
        ocr_used = False

        text_is_substantial = (
            len(existing_text.strip()) > self.BLANK_THRESHOLD * self.TEXT_SUFFICIENT_FACTOR
        )
        if not text_is_substantial and img is not None:
            try:
                ocr_text = pytesseract.image_to_string(img)
            except Exception as e:
                # Best effort: an OCR failure must not abort the whole load,
                # so keep whatever text extraction produced.
                print(f"Error applying OCR to page {page_number}: {e}")
            else:
                # Only prefer the OCR result when it actually found text.
                # Otherwise a page with short but real extracted text would be
                # discarded just because OCR returned nothing.
                if not self._is_blank_page(ocr_text):
                    page_text = ocr_text
                    ocr_used = True

        if self._is_blank_page(page_text):
            self.skipped_pages.append(page_number)
            return None

        return Document(
            page_content=page_text,
            metadata={
                # Keys below intentionally come after the spread, so "source"
                # and "page" replace any same-named keys from the loader.
                **doc.metadata,
                "source": "ocr" if ocr_used else "text_extraction",
                "page": page_number,
                "is_blank": "false",
                "has_ocr": str(ocr_used)
            }
        )

    def load(self) -> "list[Document]":
        """Load the PDF and return one document per non-blank page.

        Returns:
            The processed documents in page order. Blank pages are omitted and
            listed in ``skipped_pages``.

        Raises:
            Exception: Any error from the text loader or the image conversion
                is printed and then re-raised unchanged.
        """
        # Start clean so repeated calls do not accumulate stale page numbers.
        self.skipped_pages = []

        try:
            # 1. Standard text extraction
            loader = PyMuPDFLoader(self.file_path)
            text_documents = loader.load()

            # 2. Image conversion (pdf2image relies on poppler being installed)
            images = convert_from_path(self.file_path, dpi=self.OCR_DPI)

            if len(images) != len(text_documents):
                # Do not silently drop pages: text pages without a matching
                # image are still processed, just without an OCR fallback.
                print(
                    f"Warning: page count mismatch: {len(text_documents)} text pages "
                    f"vs {len(images)} rendered images"
                )

            enhanced_documents = []
            for page_number, doc in enumerate(text_documents, start=1):
                img = images[page_number - 1] if page_number <= len(images) else None
                enhanced_doc = self._process_page(doc, img, page_number)

                if enhanced_doc is not None:
                    enhanced_documents.append(enhanced_doc)

            if self.skipped_pages:
                print(f"Skipped blank pages: {self.skipped_pages}")

            return enhanced_documents

        except Exception as e:
            print(f"Error in OCR-enhanced loading: {e}")
            raise