import logging
import os

import pytesseract
from pdf2image import convert_from_path
from PIL import Image

# Created before the optional import below so the fallback path can log
# through this named logger. The module-level ``logging.warning()`` helper
# would implicitly call ``logging.basicConfig()`` on an unconfigured root
# logger, which is an unwanted import-time side effect for library code.
logger = logging.getLogger("ocr_engine")

# Import the robust vision logic.
# ocr_preprocessing_engine.py is expected to be importable (e.g. located in
# the same directory as this module).
try:
    from ocr_preprocessing_engine import preprocess_image
except ImportError:
    # Fail-safe if the module is missing: OCR still works, just without the
    # preprocessing stage.
    logger.warning("ocr_preprocessing_engine not found. Using raw OCR only.")

    def preprocess_image(img, page_num):
        """Identity stand-in used when the preprocessing module is missing."""
        return img


def extract_text_from_file(file_path: str) -> str:
    """Extract text from a PDF or image file using a hybrid OCR pipeline.

    For every page:

    1. Run ``preprocess_image`` on the page and OCR the result.
    2. If that raises, or yields fewer than 10 non-whitespace-trimmed
       characters, OCR the raw (unprocessed) page image instead.

    Args:
        file_path: Path to a ``.pdf``, ``.png``, ``.jpg``, ``.jpeg``,
            ``.tiff`` or ``.bmp`` file (extension match is
            case-insensitive).

    Returns:
        The concatenated page texts, each preceded by a
        ``--- Page N ---`` header, with surrounding whitespace stripped.
        Failures are reported in-band rather than raised:

        * ``""`` if ``file_path`` does not exist;
        * ``"Error reading PDF: ..."`` / ``"Error reading image: ..."`` if
          the file cannot be loaded;
        * ``"Unsupported file format. Please upload PDF or Image."`` for
          any other extension;
        * ``"OCR Failed: ..."`` if an unexpected error occurs during
          extraction.
    """
    if not os.path.exists(file_path):
        return ""

    lowered_path = file_path.lower()
    images = []
    page_chunks = []

    try:
        # 1. Image loading & DPI scaling.
        # PDFs are rasterised at 300 DPI, the resolution Tesseract's
        # documentation recommends as a minimum for good accuracy.
        if lowered_path.endswith('.pdf'):
            try:
                images = convert_from_path(file_path, dpi=300)
            except Exception as e:
                return f"Error reading PDF: {str(e)}"
        elif lowered_path.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp')):
            try:
                images = [Image.open(file_path)]
            except Exception as e:
                return f"Error reading image: {str(e)}"
        else:
            return "Unsupported file format. Please upload PDF or Image."

        # Tesseract configuration (identical for every page, so built once):
        #   --psm 4: assume a single column of text of variable sizes
        #            (suits single-column invoices).
        #   --oem 3: default engine mode, based on what is available.
        custom_config = r'--oem 3 --psm 4'

        # 2. Page-by-page extraction.
        for page_num, raw_img in enumerate(images, start=1):
            page_text = ""

            # --- STRATEGY A: ROBUST PREPROCESSING ---
            try:
                processed_img = preprocess_image(raw_img, page_num)
                page_text = pytesseract.image_to_string(
                    processed_img, config=custom_config
                )
            except Exception as e:
                # Deliberately best-effort: any failure here just routes the
                # page to the raw-image fallback below.
                logger.warning(
                    "Page %s: Preprocessing failed (%s). Skipping to fallback.",
                    page_num,
                    e,
                )

            # --- STRATEGY B: FALLBACK MECHANISM ---
            # If preprocessing was too aggressive (e.g. thresholding wiped
            # the text), hand Tesseract the untouched image and let it apply
            # its own internal binarization.
            if len(page_text.strip()) < 10:
                logger.info(
                    "Page %s: Low confidence extraction. "
                    "Retrying with raw image...",
                    page_num,
                )
                page_text = pytesseract.image_to_string(
                    raw_img, config=custom_config
                )

            page_chunks.append(f"--- Page {page_num} ---\n{page_text}\n")

    except Exception as e:
        # Top-level boundary: keep the traceback in the log, report the
        # failure to the caller as text (the function's existing contract).
        logger.exception("OCR Critical Error: %s", e)
        return f"OCR Failed: {str(e)}"
    finally:
        # Image.open() is lazy and keeps the underlying file open; release
        # every page image on all exit paths so handles are not leaked.
        for img in images:
            try:
                img.close()
            except Exception as close_err:
                logger.debug("Failed to close page image: %s", close_err)

    return "".join(page_chunks).strip()