File size: 3,035 Bytes
0eec92d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86

import os
import io
import logging
from typing import List, Optional
import fitz  # pymupdf
import pytesseract
from PIL import Image
from langchain_core.documents import Document

# Set up logging
logger = logging.getLogger(__name__)

# Configure the Tesseract executable path if needed (Windows usually requires this when it is not on PATH).
# If tesseract is already on PATH this is unnecessary, but the line below is kept as a fallback/configuration hook.
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' 

def extract_text_from_pdf_with_ocr(pdf_path: str, pages_to_ocr: Optional[List[int]] = None) -> List[Document]:
    """Extract text from a PDF by rasterizing pages and running Tesseract OCR.

    Each processed page is rendered to an image at 3x zoom, converted to
    grayscale, and passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path to the PDF file.
        pages_to_ocr: List of 0-indexed page numbers to perform OCR on.
            If None, OCR is performed on all pages. Page numbers at or
            beyond the document's page count are skipped with a warning.

    Returns:
        List of LangChain Document objects, one per processed page, in the
        order the pages were requested. Each document's metadata holds
        ``source`` (the given path), ``page`` (the requested page number)
        and ``extraction_method`` (always ``"ocr"``).

    Raises:
        Exception: Any error raised while opening, rendering, or OCR-ing
            the PDF is logged (with traceback) and re-raised unchanged.
    """
    docs: List[Document] = []

    try:
        # The context manager guarantees the PDF is closed even if a page
        # fails to render or OCR partway through the loop.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)

            # Determine which pages to process
            if pages_to_ocr is None:
                pages_to_process = range(page_count)
            else:
                pages_to_process = pages_to_ocr

            logger.info(
                "Starting OCR extraction for %s pages in %s",
                len(pages_to_process),
                os.path.basename(pdf_path),
            )

            # Zoom = 3 (approx 216 dpi) improves accuracy significantly for
            # small text/tables. The matrix is the same for every page, so
            # it is built once outside the loop.
            zoom_matrix = fitz.Matrix(3, 3)

            for page_num in pages_to_process:
                # NOTE(review): only the upper bound is checked; negative
                # page numbers are handed to load_page as-is -- confirm
                # that this is the intended behavior.
                if page_num >= page_count:
                    logger.warning(
                        "Page %s out of range for document with %s pages",
                        page_num,
                        page_count,
                    )
                    continue

                page = doc.load_page(page_num)

                # Render the page to a raster image and hand it to PIL via
                # an in-memory PNG.
                pix = page.get_pixmap(matrix=zoom_matrix)
                img_data = pix.tobytes("png")

                # Preprocessing: convert to grayscale. convert() returns a
                # new, fully loaded image, so the decoded source image can
                # be closed right away.
                # (Simple thresholding/binarization could help on
                # low-contrast scans, but it is not applied here.)
                with Image.open(io.BytesIO(img_data)) as rendered:
                    image = rendered.convert('L')

                # Perform OCR
                text = pytesseract.image_to_string(image)

                # Create Document object
                metadata = {
                    "source": pdf_path,
                    "page": page_num,
                    "extraction_method": "ocr",
                }
                docs.append(Document(page_content=text, metadata=metadata))

        logger.info("Completed OCR extraction. Generated %s documents.", len(docs))

    except Exception as e:
        # Log with traceback, then re-raise the original exception untouched.
        logger.exception("OCR extraction failed: %s", e)
        raise

    return docs