import os
from typing import List, Optional

import pytesseract
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document
from pdf2image import convert_from_path


class OCREnhancedPDFLoader:
    """Load a PDF page by page, falling back to OCR when a page has little text.

    Each page is first read through ``PyMuPDFLoader``. If the extracted text
    is short, the rendered page image is passed to Tesseract instead. Pages
    that end up with (almost) no text are dropped and their 1-based page
    numbers are recorded in ``skipped_pages``.
    """

    # A page whose cleaned text is shorter than this many characters is
    # considered blank. Five times this value is the cut-off above which the
    # embedded text layer is trusted and OCR is not attempted.
    BLANK_THRESHOLD = 10

    # Characters removed (in addition to surrounding whitespace) before the
    # length of a page's text is measured.
    _BLANK_STRIP_TABLE = str.maketrans("", "", "\n\r\t")

    def __init__(self, file_path: str, tesseract_path: Optional[str] = None):
        """Validate the inputs and optionally point pytesseract at a binary.

        Args:
            file_path: Path to the PDF file to load.
            tesseract_path: Optional path to the Tesseract executable. When
                omitted, pytesseract's own default command is left untouched
                (i.e. the binary is expected to be found on ``PATH``).

        Raises:
            FileNotFoundError: If ``file_path`` is not an existing file.
            ValueError: If ``tesseract_path`` is given but is not an existing
                file.
        """
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"PDF file not found at path: {file_path}")

        self.file_path = file_path
        # 1-based numbers of the pages dropped as blank by the latest load().
        self.skipped_pages = []

        # Only override the command when an explicit path was provided;
        # otherwise rely on the executable being discoverable on PATH.
        if tesseract_path:
            if not os.path.isfile(tesseract_path):
                raise ValueError(
                    f"Tesseract executable not found at path: {tesseract_path}"
                )
            # NOTE: this assigns a module-level attribute of pytesseract, so
            # it affects every user of pytesseract in the current process.
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

    def _is_blank_page(self, text: str) -> bool:
        """Return True when ``text`` holds fewer than BLANK_THRESHOLD characters.

        Surrounding whitespace and every newline, carriage return and tab are
        ignored when counting; inner spaces are counted.
        """
        if not text or not text.strip():
            return True

        cleaned_text = text.strip().translate(self._BLANK_STRIP_TABLE)
        return len(cleaned_text) < self.BLANK_THRESHOLD

    def _process_page(self, doc, img, page_number: int) -> Optional[Document]:
        """Build the final document for one page, or None if it is blank.

        Args:
            doc: Document produced by the text loader for this page; its
                ``page_content`` and ``metadata`` are read.
            img: Rendered image of the same page, handed to
                ``pytesseract.image_to_string`` when OCR is needed.
            page_number: 1-based page number, stored in the metadata and in
                ``skipped_pages``.

        Returns:
            A new ``Document`` whose metadata extends ``doc.metadata`` with
            ``source``, ``page``, ``is_blank`` and ``has_ocr``; or ``None``
            when the page is considered blank (the page number is then
            appended to ``self.skipped_pages``).
        """
        existing_text = doc.page_content

        if len(existing_text.strip()) > self.BLANK_THRESHOLD * 5:
            # The embedded text layer is substantial: no OCR needed.
            combined_text = existing_text
            ocr_used = False
        else:
            try:
                ocr_text = pytesseract.image_to_string(img)
            except Exception as e:
                # Best effort: an OCR failure must not abort the whole load.
                print(f"Error applying OCR to page {page_number}: {e}")
                ocr_text = ""

            if self._is_blank_page(ocr_text):
                # OCR failed or found nothing useful. Keep the (short) text
                # layer instead of discarding it, so a page with a little
                # real text is not dropped just because OCR came back empty.
                combined_text = existing_text
                ocr_used = False
            else:
                combined_text = ocr_text
                ocr_used = True

        if self._is_blank_page(combined_text):
            self.skipped_pages.append(page_number)
            return None

        return Document(
            page_content=combined_text,
            metadata={
                **doc.metadata,
                # These keys intentionally come after the spread so they
                # override same-named keys from the loader's metadata.
                "source": "ocr" if ocr_used else "text_extraction",
                "page": page_number,
                "is_blank": "false",
                "has_ocr": str(ocr_used),
            },
        )

    def load(self) -> List[Document]:
        """Load the PDF and return one document per non-blank page.

        ``skipped_pages`` is reset on every call, so after the call it lists
        only the pages skipped by this run.

        Returns:
            The documents produced by ``_process_page``, in page order, with
            blank pages omitted.

        Raises:
            Exception: Any error raised while reading or rendering the PDF is
                printed and then re-raised unchanged.
        """
        # Start from a fresh list so repeated calls do not accumulate the
        # page numbers recorded by earlier runs.
        self.skipped_pages = []

        try:
            # 1. Standard load of the embedded text layer.
            loader = PyMuPDFLoader(self.file_path)
            text_documents = loader.load()

            # 2. Image conversion (Linux requires poppler-utils installed).
            images = convert_from_path(self.file_path, dpi=300)

            # zip() stops at the shorter sequence; make a mismatch visible
            # instead of silently dropping the trailing pages.
            if len(text_documents) != len(images):
                print(
                    f"Warning: page count mismatch "
                    f"(text: {len(text_documents)}, images: {len(images)}); "
                    f"only the first {min(len(text_documents), len(images))} "
                    f"pages are processed"
                )

            enhanced_documents = []
            for page_number, (doc, img) in enumerate(
                zip(text_documents, images), start=1
            ):
                enhanced_doc = self._process_page(doc, img, page_number)
                if enhanced_doc is not None:
                    enhanced_documents.append(enhanced_doc)

            if self.skipped_pages:
                print(f"Skipped blank pages: {self.skipped_pages}")

            return enhanced_documents

        except Exception as e:
            print(f"Error in OCR-enhanced loading: {e}")
            raise