Spaces:

paulstevemithun
/

DocFusion-AI

Sleeping

App Files Files Community

DocFusion-AI / api /core /ocr.py

paulstevemithun

Initial commit

0eec92d verified 4 months ago

raw

history blame contribute delete

3.04 kB


	import os
	import io
	import logging
	from typing import List, Optional
	import fitz # pymupdf
	import pytesseract
	from PIL import Image
	from langchain_core.documents import Document

	# Set up logging
	logger = logging.getLogger(__name__)

	# Configure the Tesseract executable path if needed. On Windows this is usually
	# required when tesseract.exe is not on PATH; otherwise the line below can stay commented out.
	# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

	def extract_text_from_pdf_with_ocr(pdf_path: str, pages_to_ocr: Optional[List[int]] = None) -> List[Document]:
	    """
	    Extract text from a PDF by rendering pages to images and running OCR on them.

	    Each processed page is rendered at 3x zoom, converted to grayscale and
	    passed to Tesseract. One Document is produced per processed page.

	    Args:
	        pdf_path: Path to the PDF file.
	        pages_to_ocr: List of 0-indexed page numbers to perform OCR on.
	            If None, OCR is performed on all pages. Page numbers at or
	            beyond the page count are skipped with a warning.

	    Returns:
	        List of LangChain Document objects, in processing order. Each has the
	        OCR text as ``page_content`` and ``source``, ``page`` and
	        ``extraction_method`` ("ocr") in its metadata.

	    Raises:
	        Exception: Any error raised while opening, rendering or OCR-ing the
	            PDF is logged and then re-raised unchanged.
	    """
	    docs: List[Document] = []

	    try:
	        # Open the PDF as a context manager so the handle is released even if
	        # rendering or OCR fails part-way through (previously it leaked on error).
	        with fitz.open(pdf_path) as doc:
	            page_count = len(doc)

	            # Determine which pages to process
	            if pages_to_ocr is None:
	                pages_to_process = range(page_count)
	            else:
	                pages_to_process = pages_to_ocr

	            logger.info(f"Starting OCR extraction for {len(pages_to_process)} pages in {os.path.basename(pdf_path)}")

	            # Zoom = 3 (approx 216 dpi) improves accuracy significantly for small text/tables.
	            # The matrix is the same for every page, so build it once.
	            mat = fitz.Matrix(3, 3)

	            for page_num in pages_to_process:
	                # Only the upper bound is checked here; other values are handed to PyMuPDF as-is.
	                if page_num >= page_count:
	                    logger.warning(f"Page {page_num} out of range for document with {page_count} pages")
	                    continue

	                page = doc.load_page(page_num)

	                # Render the page and hand it to PIL via an in-memory PNG.
	                pix = page.get_pixmap(matrix=mat)
	                img_data = pix.tobytes("png")
	                image = Image.open(io.BytesIO(img_data))

	                # Preprocessing: convert to grayscale.
	                # Optional: simple thresholding (binarization) can help if contrast is poor, e.g.
	                #   image = image.point(lambda x: 0 if x < 128 else 255, '1')
	                image = image.convert('L')

	                # Perform OCR
	                text = pytesseract.image_to_string(image)

	                metadata = {
	                    "source": pdf_path,
	                    "page": page_num,
	                    "extraction_method": "ocr",
	                }
	                docs.append(Document(page_content=text, metadata=metadata))

	        logger.info(f"Completed OCR extraction. Generated {len(docs)} documents.")

	    except Exception as e:
	        logger.error(f"OCR extraction failed: {e}")
	        # Bare raise re-raises the active exception with its original traceback.
	        raise

	    return docs