Spaces:

NavyDevilDoc
/

AI_Toolkit

Sleeping

App Files Files Community

AI_Toolkit / src /core /OCREnhancedPDFLoader.py

NavyDevilDoc

Upload 10 files

c0f31c1 verified 4 months ago

raw

history blame contribute delete

3.36 kB

	import os
	import pytesseract
	from langchain_community.document_loaders import PyMuPDFLoader
	from langchain_core.documents import Document
	from pdf2image import convert_from_path

	class OCREnhancedPDFLoader:
	    """Load a PDF page by page, falling back to OCR for pages with little text.

	    The embedded text of each page (as returned by ``PyMuPDFLoader``) is kept
	    when it is long enough; otherwise the rendered page image is passed to
	    Tesseract. Pages that are still blank afterwards are dropped and their
	    1-based page numbers are recorded in ``skipped_pages``.
	    """

	    # A page whose text, after stripping and removing newlines/tabs, is shorter
	    # than this many characters is treated as blank. Five times this value is
	    # the length above which embedded text is used without attempting OCR.
	    BLANK_THRESHOLD = 10

	    def __init__(self, file_path: str, tesseract_path: str = None):
	        """Validate the inputs and optionally point pytesseract at a binary.

	        Args:
	            file_path: Path to the PDF file to load.
	            tesseract_path: Path to the Tesseract executable. When omitted
	                (or empty), pytesseract's configured command is left as is.

	        Raises:
	            FileNotFoundError: If ``file_path`` is not an existing file.
	            ValueError: If ``tesseract_path`` is given but is not an existing
	                file.
	        """
	        if not os.path.isfile(file_path):
	            raise FileNotFoundError(f"PDF file not found at path: {file_path}")

	        self.file_path = file_path
	        # 1-based numbers of the pages dropped as blank by the latest load().
	        self.skipped_pages = []

	        # Only override the command when a path is supplied; otherwise the
	        # executable is expected to be resolvable by pytesseract itself.
	        if tesseract_path:
	            if not os.path.isfile(tesseract_path):
	                raise ValueError(f"Tesseract executable not found at path: {tesseract_path}")
	            # NOTE: this assigns a module-level attribute, so it applies to
	            # every pytesseract call in the process, not just this loader.
	            pytesseract.pytesseract.tesseract_cmd = tesseract_path

	    def _is_blank_page(self, text: str) -> bool:
	        """Return True when ``text`` is empty or shorter than BLANK_THRESHOLD.

	        The length is measured after stripping surrounding whitespace and
	        removing newline, carriage-return and tab characters; inner spaces
	        still count towards the length.
	        """
	        if not text or not text.strip():
	            return True
	        cleaned_text = text.strip().replace('\n', '').replace('\r', '').replace('\t', '')
	        return len(cleaned_text) < self.BLANK_THRESHOLD

	    def _process_page(self, doc, img, page_number: int):
	        """Build the output document for one page, or None if it is blank.

	        Args:
	            doc: Document produced by the text loader for this page; its
	                ``page_content`` and ``metadata`` are read.
	            img: Rendered page image handed to ``pytesseract.image_to_string``.
	            page_number: 1-based page number, used in metadata and messages.

	        Returns:
	            A new ``Document`` for the page, or ``None`` when the page is
	            blank (its number is then appended to ``skipped_pages``).
	        """
	        existing_text = doc.page_content
	        combined_text = existing_text
	        ocr_used = False

	        # Substantial embedded text is used as-is; OCR is only attempted for
	        # pages whose extracted text is short.
	        if len(existing_text.strip()) <= self.BLANK_THRESHOLD * 5:
	            try:
	                ocr_text = pytesseract.image_to_string(img)
	            except Exception as e:
	                # Best effort: an OCR failure on one page must not abort the
	                # whole load, so keep whatever text was extracted.
	                print(f"Error applying OCR to page {page_number}: {e}")
	            else:
	                # Prefer the OCR result only when it actually found text. An
	                # empty OCR result must not discard short but real embedded
	                # text, which would wrongly mark the page as blank.
	                if not self._is_blank_page(ocr_text):
	                    combined_text = ocr_text
	                    ocr_used = True

	        if self._is_blank_page(combined_text):
	            self.skipped_pages.append(page_number)
	            return None

	        # Keys listed after the unpacking override same-named keys coming
	        # from the loader's metadata (e.g. "source" and "page").
	        return Document(
	            page_content=combined_text,
	            metadata={
	                **doc.metadata,
	                "source": "ocr" if ocr_used else "text_extraction",
	                "page": page_number,
	                "is_blank": "false",
	                "has_ocr": str(ocr_used),
	            },
	        )

	    def load(self):
	        """Load the PDF and return one document per non-blank page.

	        Returns:
	            A list of ``Document`` objects in page order. Blank pages are
	            omitted; their numbers are available in ``skipped_pages``.

	        Raises:
	            Exception: Any error raised while loading, rendering or
	                processing is printed and then re-raised unchanged.
	        """
	        # Start from a fresh list so repeated calls do not accumulate the
	        # skipped pages of earlier runs.
	        self.skipped_pages = []

	        try:
	            # 1. Standard text extraction.
	            text_documents = PyMuPDFLoader(self.file_path).load()

	            # 2. Image conversion (Linux requires poppler-utils installed).
	            images = convert_from_path(self.file_path, dpi=300)

	            # zip() stops at the shorter sequence; make that visible instead
	            # of silently dropping the trailing pages.
	            if len(text_documents) != len(images):
	                print(
	                    f"Warning: page count mismatch (text: {len(text_documents)}, "
	                    f"images: {len(images)}); only the first "
	                    f"{min(len(text_documents), len(images))} pages will be processed"
	                )

	            enhanced_documents = []
	            for page_number, (doc, img) in enumerate(zip(text_documents, images), start=1):
	                enhanced_doc = self._process_page(doc, img, page_number)
	                if enhanced_doc is not None:
	                    enhanced_documents.append(enhanced_doc)

	            if self.skipped_pages:
	                print(f"Skipped blank pages: {self.skipped_pages}")

	            return enhanced_documents

	        except Exception as e:
	            print(f"Error in OCR-enhanced loading: {e}")
	            raise