import os
import io
import logging
from typing import List, Optional

import fitz  # pymupdf
import pytesseract
from PIL import Image
from langchain_core.documents import Document

# Set up logging
logger = logging.getLogger(__name__)

# Configure the Tesseract path if needed. On Windows this is usually required
# when tesseract.exe is not on PATH; uncomment and adjust the line below.
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Render scale applied on both axes before OCR. PDF user space is 72 dpi, so a
# factor of 3 yields roughly 216 dpi, which helps with small text and tables.
_OCR_ZOOM = 3


def _ocr_page(page) -> str:
    """Render a single PDF page to a grayscale image and return its OCR text."""
    pix = page.get_pixmap(matrix=fitz.Matrix(_OCR_ZOOM, _OCR_ZOOM))

    # Round-trip through PNG bytes to hand the raster over to PIL.
    image = Image.open(io.BytesIO(pix.tobytes("png")))

    # Preprocessing: convert to grayscale.
    image = image.convert('L')

    # Optional: simple thresholding (binarization) can help if contrast is poor
    # image = image.point(lambda x: 0 if x < 128 else 255, '1')

    return pytesseract.image_to_string(image)


def extract_text_from_pdf_with_ocr(pdf_path: str, pages_to_ocr: Optional[List[int]] = None) -> List[Document]:
    """
    Extracts text from a PDF using OCR for specified pages or all pages.

    Args:
        pdf_path: Path to the PDF file.
        pages_to_ocr: List of 0-indexed page numbers to perform OCR on.
            If None, OCR is performed on all pages. Page numbers that are
            greater than or equal to the page count are skipped with a
            warning.

    Returns:
        List of LangChain Document objects, one per processed page, in the
        order the pages were requested. Each document's metadata carries
        "source" (the given pdf_path), "page" (the page number as requested)
        and "extraction_method" ("ocr").

    Raises:
        Exception: Any error raised while opening, rendering or OCR-ing the
            PDF is logged and re-raised unchanged.
    """
    docs = []
    try:
        # The context manager guarantees the document is closed even when
        # rendering or OCR fails part-way through.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)

            # Determine which pages to process
            if pages_to_ocr is None:
                pages_to_process = range(page_count)
            else:
                pages_to_process = pages_to_ocr

            logger.info(
                "Starting OCR extraction for %s pages in %s",
                len(pages_to_process),
                os.path.basename(pdf_path),
            )

            for page_num in pages_to_process:
                if page_num >= page_count:
                    logger.warning(
                        "Page %s out of range for document with %s pages",
                        page_num,
                        page_count,
                    )
                    continue

                text = _ocr_page(doc.load_page(page_num))

                metadata = {
                    "source": pdf_path,
                    "page": page_num,
                    "extraction_method": "ocr",
                }
                docs.append(Document(page_content=text, metadata=metadata))

        logger.info("Completed OCR extraction. Generated %s documents.", len(docs))

    except Exception as e:
        # logger.exception records the traceback; the bare raise re-raises
        # the original exception for the caller.
        logger.exception("OCR extraction failed: %s", e)
        raise

    return docs