Spaces:
Sleeping
Sleeping
| import os | |
| import io | |
| import logging | |
| from typing import List, Optional | |
| import fitz # pymupdf | |
| import pytesseract | |
| from PIL import Image | |
| from langchain_core.documents import Document | |
| # Set up logging | |
| logger = logging.getLogger(__name__) | |
| # Configure Tesseract path if needed (Windows usually requires this if not in PATH) | |
| # If tesseract is in PATH, this might not be needed, but good to have as a fallback or config | |
| # pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' | |
def extract_text_from_pdf_with_ocr(pdf_path: str, pages_to_ocr: Optional[List[int]] = None) -> List[Document]:
    """Run OCR over pages of a PDF and return one Document per processed page.

    Each selected page is rendered to an image with PyMuPDF, converted to
    grayscale, and passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path to the PDF file.
        pages_to_ocr: List of 0-indexed page numbers to perform OCR on.
            If None, OCR is performed on all pages. Page numbers at or
            beyond the document's page count are skipped with a warning.

    Returns:
        List of LangChain Document objects with the extracted text, in the
        order the pages were processed. Each document's metadata holds
        ``source`` (the given path), ``page`` (the requested page number) and
        ``extraction_method`` (always ``"ocr"``).

    Raises:
        Exception: Any error raised while opening, rendering, or OCR-ing the
            PDF is logged and then re-raised unchanged.
    """
    docs = []
    try:
        # The context manager guarantees the PDF handle is released even when
        # rendering or OCR raises part-way through the page loop.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)

            # Determine which pages to process
            pages_to_process = range(page_count) if pages_to_ocr is None else pages_to_ocr
            logger.info(
                "Starting OCR extraction for %d pages in %s",
                len(pages_to_process),
                os.path.basename(pdf_path),
            )

            # Zoom = 3 (approx 216 dpi) improves accuracy significantly for
            # small text/tables. The matrix is identical for every page, so
            # it is built once outside the loop.
            mat = fitz.Matrix(3, 3)

            for page_num in pages_to_process:
                if page_num >= page_count:
                    logger.warning(
                        "Page %s out of range for document with %s pages",
                        page_num,
                        page_count,
                    )
                    continue

                page = doc.load_page(page_num)

                # Render the page and hand it to PIL via an in-memory PNG
                pix = page.get_pixmap(matrix=mat)
                img_data = pix.tobytes("png")
                image = Image.open(io.BytesIO(img_data))

                # Preprocessing: convert to grayscale.
                # NOTE: simple thresholding (binarization, e.g. at 128) can
                # help further when contrast is poor; it is not applied here.
                image = image.convert('L')

                # Perform OCR
                text = pytesseract.image_to_string(image)

                # Create Document object
                metadata = {
                    "source": pdf_path,
                    "page": page_num,
                    "extraction_method": "ocr"
                }
                docs.append(Document(page_content=text, metadata=metadata))

        logger.info("Completed OCR extraction. Generated %d documents.", len(docs))
    except Exception as e:
        logger.error("OCR extraction failed: %s", e)
        raise
    return docs