import os
import pytesseract
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document
from pdf2image import convert_from_path
class OCREnhancedPDFLoader:
    """Load a PDF page by page, falling back to OCR for pages with little text.

    Every page is loaded through ``PyMuPDFLoader`` and also rendered to an
    image.  When a page's embedded text is substantial it is used as is;
    otherwise the page image is passed to Tesseract.  Pages that still look
    blank are dropped and their 1-based numbers recorded in ``skipped_pages``.
    """

    # Pages whose cleaned text is shorter than this many characters are blank.
    BLANK_THRESHOLD = 10

    # Translation table deleting the layout characters ignored by the blank check.
    _LAYOUT_CHARS = str.maketrans("", "", "\n\r\t")

    def __init__(self, file_path: str, tesseract_path: "str | None" = None):
        """Validate the input paths and optionally point pytesseract at a binary.

        Args:
            file_path: Path to the PDF file to load.
            tesseract_path: Optional path to the Tesseract executable.  When
                omitted, pytesseract's own default command is left untouched
                (the original author's intent: rely on the system PATH).

        Raises:
            FileNotFoundError: If ``file_path`` is not an existing file.
            ValueError: If ``tesseract_path`` is given but is not an existing
                file.
        """
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"PDF file not found at path: {file_path}")
        self.file_path = file_path
        # 1-based numbers of the pages dropped as blank by the latest load().
        self.skipped_pages = []
        if tesseract_path:
            if not os.path.isfile(tesseract_path):
                raise ValueError(
                    f"Tesseract executable not found at path: {tesseract_path}"
                )
            # NOTE: this assigns a module-level attribute of pytesseract, so it
            # applies to every pytesseract call in the process, not only to
            # this loader instance.
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

    def _is_blank_page(self, text: str) -> bool:
        """Return True when ``text`` is too short to count as page content.

        Leading/trailing whitespace and any newline, carriage-return or tab
        characters are ignored; interior spaces still count towards the
        length.  ``None`` and empty strings are blank.
        """
        if not text or not text.strip():
            return True
        cleaned_text = text.strip().translate(self._LAYOUT_CHARS)
        return len(cleaned_text) < self.BLANK_THRESHOLD

    def _process_page(self, doc, img, page_number: int) -> "Document | None":
        """Build the final document for one page, running OCR when needed.

        Args:
            doc: The page's document from the text loader; ``page_content``
                and ``metadata`` are read from it.
            img: The rendered page image handed to
                ``pytesseract.image_to_string``.
            page_number: 1-based page number, stored in the metadata and in
                ``skipped_pages``.

        Returns:
            A new ``Document``, or ``None`` when the page is considered blank
            (in which case ``page_number`` is appended to ``skipped_pages``).
        """
        existing_text = doc.page_content
        combined_text = existing_text
        ocr_used = False

        # Embedded text of up to five times the blank threshold is treated as
        # too thin to trust, so OCR is attempted for such pages.
        if len(existing_text.strip()) <= self.BLANK_THRESHOLD * 5:
            try:
                ocr_text = pytesseract.image_to_string(img)
            except Exception as e:
                # Best effort: an OCR failure must not abort the whole load,
                # so report it and keep whatever embedded text there is.
                print(f"Error applying OCR to page {page_number}: {e}")
            else:
                # Only prefer the OCR result when it actually found content.
                # Otherwise a page with short but real embedded text would be
                # replaced by an empty OCR result and dropped as blank.
                if not self._is_blank_page(ocr_text):
                    combined_text = ocr_text
                    ocr_used = True

        if self._is_blank_page(combined_text):
            self.skipped_pages.append(page_number)
            return None

        return Document(
            page_content=combined_text,
            metadata={
                # The keys below override same-named keys coming from the
                # loader's metadata (notably "source" and "page").
                **doc.metadata,
                "source": "ocr" if ocr_used else "text_extraction",
                "page": page_number,
                "is_blank": "false",
                "has_ocr": str(ocr_used),
            },
        )

    def load(self) -> "list[Document]":
        """Load the PDF and return one document per non-blank page.

        Returns:
            The documents produced by ``_process_page``, in page order, with
            blank pages omitted.

        Raises:
            Exception: Any error raised while loading or rendering the PDF is
                printed and then re-raised unchanged.
        """
        # Start from a clean slate so that calling load() again does not
        # accumulate page numbers from a previous run.
        self.skipped_pages = []
        try:
            # 1. Standard text extraction.
            loader = PyMuPDFLoader(self.file_path)
            text_documents = loader.load()

            # 2. Page rendering (the original notes that Linux needs
            #    poppler-utils installed for this step).
            images = convert_from_path(self.file_path, dpi=300)

            # Documents and images are paired by position; this assumes the
            # loader yields one document per page - TODO confirm.  zip() stops
            # at the shorter sequence, so make a mismatch visible instead of
            # silently losing pages.
            if len(text_documents) != len(images):
                print(
                    f"Warning: page count mismatch ({len(text_documents)} text "
                    f"pages, {len(images)} images); unmatched pages are ignored"
                )

            enhanced_documents = []
            for page_number, (doc, img) in enumerate(
                zip(text_documents, images), start=1
            ):
                enhanced_doc = self._process_page(doc, img, page_number)
                if enhanced_doc is not None:
                    enhanced_documents.append(enhanced_doc)

            if self.skipped_pages:
                print(f"Skipped blank pages: {self.skipped_pages}")
            return enhanced_documents
        except Exception as e:
            print(f"Error in OCR-enhanced loading: {e}")
            raise