Spaces:
Sleeping
Sleeping
| import pytesseract | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
| import os | |
| import logging | |
# Module-level logger. It is created before the optional import below so the
# fallback branch can report through it instead of the root logger (calling
# logging.warning() directly would implicitly run logging.basicConfig() at
# import time when the root logger has no handlers).
logger = logging.getLogger("ocr_engine")

# Import the robust vision logic.
# ocr_preprocessing_engine.py must be importable (e.g. in the same directory).
try:
    from ocr_preprocessing_engine import preprocess_image
except ImportError:
    # Fail-safe if the module is missing: OCR proceeds on unprocessed images.
    logger.warning("ocr_preprocessing_engine not found. Using raw OCR only.")

    def preprocess_image(img, page_num):
        """Identity stand-in used when the preprocessing engine is unavailable.

        Args:
            img: The page image to "preprocess".
            page_num: 1-based page number; accepted for signature
                compatibility and ignored.

        Returns:
            The very same ``img`` object, unmodified.
        """
        return img
def _ocr_page(raw_img, page_num, config):
    """Return the OCR text for one page, preferring the preprocessed image.

    Strategy A runs ``preprocess_image`` followed by Tesseract. If that fails
    or yields fewer than 10 non-whitespace-trimmed characters, Strategy B
    re-runs Tesseract on the untouched image and the longer of the two
    results is kept (the raw result wins ties).

    Args:
        raw_img: The page image as loaded from disk.
        page_num: 1-based page number, used for preprocessing and log lines.
        config: Tesseract command-line configuration string.

    Returns:
        The extracted text for the page (possibly empty).

    Raises:
        Exception: Whatever ``pytesseract.image_to_string`` raises during the
            raw-image fallback; Strategy A failures are logged and swallowed.
    """
    page_text = ""
    raw_already_tried = False

    # --- STRATEGY A: ROBUST PREPROCESSING ---
    try:
        processed_img = preprocess_image(raw_img, page_num)
        page_text = pytesseract.image_to_string(processed_img, config=config)
        # If preprocessing handed back the very same object (as the module's
        # fail-safe stand-in does), the raw image has effectively been OCR'd
        # already and a retry would only repeat the same work.
        raw_already_tried = processed_img is raw_img
    except Exception as e:
        logger.warning(
            "Page %s: Preprocessing failed (%s). Skipping to fallback.",
            page_num,
            e,
        )

    if len(page_text.strip()) >= 10 or raw_already_tried:
        return page_text

    # --- STRATEGY B: FALLBACK MECHANISM ---
    # If preprocessing was too aggressive (e.g. thresholding wiped the text),
    # rely on Tesseract's internal binarization of the raw image instead.
    logger.info(
        "Page %s: Low confidence extraction. Retrying with raw image...",
        page_num,
    )
    raw_text = pytesseract.image_to_string(raw_img, config=config)

    # Never trade a short-but-present result for an even shorter one.
    if len(raw_text.strip()) >= len(page_text.strip()):
        return raw_text
    return page_text


def extract_text_from_file(file_path: str) -> str:
    """Extract text from a PDF or image file using a hybrid OCR pipeline.

    For every page the pipeline:
      1. Attempts robust preprocessing followed by Tesseract.
      2. Falls back to the raw image when that yields little or no text.

    Args:
        file_path: Path to a ``.pdf``, ``.png``, ``.jpg``, ``.jpeg``,
            ``.tiff`` or ``.bmp`` file (extension match is case-insensitive).
            Path-like objects are accepted as well.

    Returns:
        The concatenated page texts, each prefixed with a
        ``--- Page N ---`` header, with surrounding whitespace stripped.
        Special cases (all returned as strings, never raised):
          * ``""`` when ``file_path`` is empty/None or does not exist.
          * ``"Unsupported file format. ..."`` for other extensions.
          * ``"Error reading PDF: ..."`` / ``"Error reading image: ..."``
            when the file cannot be loaded.
          * ``"OCR Failed: ..."`` when OCR itself raises.
    """
    if not file_path or not os.path.exists(file_path):
        return ""

    # 1. Image loading & DPI scaling.
    lowered = str(file_path).lower()
    if lowered.endswith('.pdf'):
        try:
            # PDFs are rasterized at 300 DPI, the resolution the original
            # author cites as working best for Tesseract.
            images = convert_from_path(file_path, dpi=300)
        except Exception as e:
            return f"Error reading PDF: {str(e)}"
    elif lowered.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp')):
        try:
            # NOTE(review): presumably only the first frame of a multi-frame
            # TIFF is processed here — confirm whether that is intended.
            images = [Image.open(file_path)]
        except Exception as e:
            return f"Error reading image: {str(e)}"
    else:
        return "Unsupported file format. Please upload PDF or Image."

    # Tesseract configuration (identical for every page, so built once):
    #   --psm 4: assume a single column of text of variable sizes
    #            (good for single-column invoices)
    #   --oem 3: default engine selection
    custom_config = r'--oem 3 --psm 4'

    # 2. Page-by-page extraction.
    page_sections = []
    try:
        for page_num, raw_img in enumerate(images, start=1):
            page_text = _ocr_page(raw_img, page_num, custom_config)
            page_sections.append(f"--- Page {page_num} ---\n{page_text}\n")
    except Exception as e:
        logger.exception("OCR Critical Error: %s", e)
        return f"OCR Failed: {str(e)}"
    finally:
        # Release file handles / pixel buffers held by the loaded pages,
        # whether OCR succeeded or not.
        for img in images:
            img.close()

    return "".join(page_sections).strip()