Spaces:
Sleeping
Sleeping
| import os | |
| import pytesseract | |
| from langchain_community.document_loaders import PyMuPDFLoader | |
| from langchain_core.documents import Document | |
| from pdf2image import convert_from_path | |
class OCREnhancedPDFLoader:
    """Load a PDF page by page, using Tesseract OCR for pages with little text.

    Each page is first read with ``PyMuPDFLoader``. When the extracted text
    of a page is short, the rendered page image is passed through
    ``pytesseract`` instead. Pages that are still blank afterwards are left
    out of the result and their 1-based numbers are recorded in
    ``skipped_pages``.
    """

    # A page whose cleaned text is shorter than this many characters is blank.
    BLANK_THRESHOLD = 10

    # No default Tesseract path: unless the caller supplies one, the
    # executable is expected to be discoverable on PATH.
    def __init__(self, file_path: str, tesseract_path: str = None):
        """Validate the inputs and remember the PDF location.

        Args:
            file_path: Path to the PDF file to load.
            tesseract_path: Optional path to the Tesseract executable. When
                omitted (``None`` or empty) pytesseract's current setting is
                left untouched.

        Raises:
            FileNotFoundError: If ``file_path`` is not an existing file.
            ValueError: If ``tesseract_path`` is given but is not an existing
                file.
        """
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"PDF file not found at path: {file_path}")

        self.file_path = file_path
        # 1-based numbers of the pages dropped as blank by the latest load().
        self.skipped_pages = []

        # Only set cmd if specific path provided, otherwise trust PATH.
        if tesseract_path:
            if not os.path.isfile(tesseract_path):
                raise ValueError(
                    f"Tesseract executable not found at path: {tesseract_path}"
                )
            # NOTE: this is a module-level setting of pytesseract, so it
            # affects every user of pytesseract in the process.
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

    def _is_blank_page(self, text: str) -> bool:
        """Return True when ``text`` has fewer than BLANK_THRESHOLD characters.

        Surrounding whitespace and all newlines, carriage returns and tabs
        are ignored for the count; inner spaces are counted.
        """
        if not text or not text.strip():
            return True

        cleaned_text = (
            text.strip().replace('\n', '').replace('\r', '').replace('\t', '')
        )
        return len(cleaned_text) < self.BLANK_THRESHOLD

    def _process_page(self, doc, img, page_number: int):
        """Build the final document for one page, or skip it when blank.

        Args:
            doc: Document produced by the text loader for this page.
            img: Rendered image of the same page, used only when OCR runs.
            page_number: 1-based page number stored in the metadata.

        Returns:
            A new ``Document`` with the chosen text, or ``None`` when the page
            is blank (its number is then appended to ``skipped_pages``).
        """
        existing_text = doc.page_content
        combined_text = existing_text
        ocr_used = False

        # Extracted text is trusted when substantial; otherwise try OCR.
        if len(existing_text.strip()) <= self.BLANK_THRESHOLD * 5:
            try:
                ocr_text = pytesseract.image_to_string(img)
            except Exception as e:
                # Deliberate best effort: an OCR failure must not abort the
                # whole load, so the extracted text is kept instead.
                print(f"Error applying OCR to page {page_number}: {e}")
            else:
                # Keep the short extracted text when OCR found nothing, so a
                # page with a little real text is not discarded as blank.
                if not self._is_blank_page(ocr_text):
                    combined_text = ocr_text
                    ocr_used = True

        if self._is_blank_page(combined_text):
            self.skipped_pages.append(page_number)
            return None

        return Document(
            page_content=combined_text,
            metadata={
                # Keys below override same-named keys from doc.metadata.
                **doc.metadata,
                "source": "ocr" if ocr_used else "text_extraction",
                "page": page_number,
                "is_blank": "false",
                "has_ocr": str(ocr_used),
            },
        )

    def load(self):
        """Load the PDF and return one document per non-blank page.

        Returns:
            List of ``Document`` objects in page order, blank pages omitted.

        Raises:
            Exception: Any error raised while loading or rendering the PDF is
                printed and re-raised unchanged.
        """
        # Start fresh so repeated calls do not accumulate stale page numbers.
        self.skipped_pages = []

        try:
            # 1. Standard Load
            text_documents = PyMuPDFLoader(self.file_path).load()

            # 2. Image Conversion (Linux requires poppler-utils installed)
            images = convert_from_path(self.file_path, dpi=300)

            # zip() below stops at the shorter sequence; make that visible
            # instead of silently dropping trailing pages.
            if len(text_documents) != len(images):
                print(
                    f"Warning: page count mismatch "
                    f"(text: {len(text_documents)}, images: {len(images)}); "
                    f"extra pages are ignored"
                )

            enhanced_documents = []
            for page_number, (doc, img) in enumerate(
                zip(text_documents, images), start=1
            ):
                enhanced_doc = self._process_page(doc, img, page_number)
                if enhanced_doc is not None:
                    enhanced_documents.append(enhanced_doc)

            if self.skipped_pages:
                print(f"Skipped blank pages: {self.skipped_pages}")

            return enhanced_documents
        except Exception as e:
            print(f"Error in OCR-enhanced loading: {e}")
            raise