Spaces:
Sleeping
Sleeping
File size: 3,035 Bytes
0eec92d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import os
import io
import logging
from typing import List, Optional
import fitz # pymupdf
import pytesseract
from PIL import Image
from langchain_core.documents import Document
# Module-level logger named after this module; no handlers or levels are configured here.
logger = logging.getLogger(__name__)
# Optional Tesseract configuration: pytesseract needs to locate the `tesseract` executable.
# If it is not on PATH (common on Windows), uncomment the line below and adjust the path
# to match the local installation.
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
def extract_text_from_pdf_with_ocr(pdf_path: str, pages_to_ocr: Optional[List[int]] = None) -> List[Document]:
    """
    Extract text from a PDF by rendering pages to images and running OCR on them.

    Each selected page is rendered at 3x zoom, converted to a grayscale PIL
    image, and passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path to the PDF file.
        pages_to_ocr: List of 0-indexed page numbers to perform OCR on.
            If None, OCR is performed on all pages. Page numbers greater than
            or equal to the document's page count are skipped with a warning.

    Returns:
        List of LangChain Document objects, one per processed page, in the
        order the pages were requested. Each document's metadata holds
        ``source`` (the given ``pdf_path``), ``page`` (the page number as
        requested) and ``extraction_method`` (always ``"ocr"``).

    Raises:
        Exception: Any error raised while opening, rendering or OCR-ing the
            PDF is logged and then re-raised unchanged.
    """
    docs = []
    try:
        doc = fitz.open(pdf_path)
        try:
            page_count = len(doc)

            # Determine which pages to process
            if pages_to_ocr is None:
                pages_to_process = range(page_count)
            else:
                pages_to_process = pages_to_ocr

            logger.info(f"Starting OCR extraction for {len(pages_to_process)} pages in {os.path.basename(pdf_path)}")

            # Zoom = 3 (approx 216 dpi) improves accuracy significantly for
            # small text/tables. The matrix is identical for every page, so
            # it is built once outside the loop.
            mat = fitz.Matrix(3, 3)

            for page_num in pages_to_process:
                if page_num >= page_count:
                    logger.warning(f"Page {page_num} out of range for document with {page_count} pages")
                    continue

                page = doc.load_page(page_num)

                # Render the page to a pixmap and hand it to PIL as PNG bytes
                pix = page.get_pixmap(matrix=mat)
                img_data = pix.tobytes("png")
                image = Image.open(io.BytesIO(img_data))

                # Preprocessing: convert to grayscale before OCR.
                # NOTE: simple thresholding (binarization) may help when
                # contrast is poor; it is intentionally not applied here.
                image = image.convert('L')

                # Perform OCR
                text = pytesseract.image_to_string(image)

                # Create Document object
                metadata = {
                    "source": pdf_path,
                    "page": page_num,
                    "extraction_method": "ocr"
                }
                docs.append(Document(page_content=text, metadata=metadata))
        finally:
            # Always release the PDF handle, even if rendering or OCR fails
            # part-way through the document.
            doc.close()

        logger.info(f"Completed OCR extraction. Generated {len(docs)} documents.")
    except Exception as e:
        logger.error(f"OCR extraction failed: {e}")
        # Bare raise re-raises the active exception as-is.
        raise
    return docs
|