Spaces:
Sleeping
Sleeping
| import cv2 | |
| import numpy as np | |
| import pytesseract | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
| import os | |
| import logging | |
| import tempfile # Added for safe path handling | |
# Module-level logger shared by every function in this file.
logger: logging.Logger = logging.getLogger("ocr_preprocessor")

# When True, each preprocessed page is also written to disk as a PNG so the
# preprocessing result can be inspected by eye.
DEBUG_SAVE_IMAGES: bool = True
def preprocess_image(image: Image.Image, page_num: int) -> Image.Image:
    """Clean up a page image so that Tesseract can read it more reliably.

    Steps, in the order they are applied:

    1. Grayscale conversion.
    2. Normalization (min-max contrast stretching to 0-255).
    3. Denoising (3x3 Gaussian blur).
    4. Thresholding (adaptive Gaussian binarization).
    5. Deskewing (rotation correction, applied only when the estimated
       skew exceeds 0.5 degrees).

    When ``DEBUG_SAVE_IMAGES`` is true, the result is also written to the
    system temp directory to avoid permission errors in HF Spaces. A failure
    to write that file is logged and otherwise ignored.

    Args:
        image: Page image in any PIL mode; non-RGB input is converted to RGB.
        page_num: Page number, used only to name the debug image.

    Returns:
        The binarized (and possibly rotated) page as a PIL image.
    """
    # COLOR_RGB2GRAY needs exactly three channels; grayscale, palette and
    # RGBA images (all possible from Image.open) would make cvtColor raise.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # 1. Convert to grayscale
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)

    # 2. Normalization
    gray = cv2.normalize(gray, None, 0, 255, cv2.NORM_MINMAX)

    # 3. Denoising (3x3 kernel)
    denoised = cv2.GaussianBlur(gray, (3, 3), 0)

    # 4. Adaptive thresholding
    binary = cv2.adaptiveThreshold(
        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2
    )

    # 5. Deskewing: fit a rotated rectangle around the dark (ink) pixels,
    # which are the non-zero pixels of the inverted binary image.
    ink_coords = np.column_stack(np.where(cv2.bitwise_not(binary) > 0))
    if ink_coords.size > 0:
        raw_angle = cv2.minAreaRect(ink_coords)[-1]
        # minAreaRect reports the angle in [-90, 0) on older OpenCV builds and
        # in (0, 90] on newer ones. Both describe the same rectangle modulo
        # 90 degrees, so fold the value into [-45, 45) before negating it.
        # This matches the classic "< -45" recipe on old builds and stops an
        # upright page (reported as 90) from being rotated by a quarter turn.
        skew = (raw_angle + 45.0) % 90.0 - 45.0
        angle = -skew
        if abs(angle) > 0.5:
            (h, w) = binary.shape[:2]
            center = (w // 2, h // 2)
            rotation = cv2.getRotationMatrix2D(center, angle, 1.0)
            binary = cv2.warpAffine(
                binary, rotation, (w, h),
                flags=cv2.INTER_CUBIC,
                borderMode=cv2.BORDER_REPLICATE
            )

    # --- DEBUG SAVING LOGIC ---
    if DEBUG_SAVE_IMAGES:
        try:
            # Use the system temp directory (/tmp in Linux/HF Spaces)
            debug_path = os.path.join(
                tempfile.gettempdir(),
                f"debug_page_{page_num}_processed.png",
            )
            # imwrite signals most failures by returning False, not raising.
            if cv2.imwrite(debug_path, binary):
                logger.info("Debug image saved to: %s", debug_path)
            else:
                logger.warning(
                    "Could not save debug image: write to %s failed", debug_path
                )
        except Exception as e:
            logger.warning("Could not save debug image: %s", e)
    # --------------------------

    return Image.fromarray(binary)
def extract_text_with_preprocessing(file_path: str) -> str:
    """Run OCR over a PDF or image file and return the text of every page.

    Pipeline: PDF -> 300 DPI images -> preprocessing -> Tesseract
    (``--oem 3 --psm 4``). Any path that does not end in ``.pdf`` is opened
    as a single image. Each page is first OCR'd after ``preprocess_image``;
    if that step raises, or yields fewer than 10 characters once stripped,
    OCR is retried on the unprocessed page image.

    Args:
        file_path: Path to the PDF or image file to read.

    Returns:
        The page texts, each preceded by a ``--- Page N ---`` header, with
        surrounding whitespace stripped. Returns ``""`` when ``file_path``
        does not exist, and ``"Error processing file: <message>"`` when
        loading the file or the raw-image OCR fails.
    """
    if not os.path.exists(file_path):
        return ""

    tesseract_config = r'--oem 3 --psm 4'
    images = []
    page_blocks = []
    try:
        if file_path.lower().endswith('.pdf'):
            # Convert PDF to images at 300 DPI
            images = convert_from_path(file_path, dpi=300)
        else:
            images = [Image.open(file_path)]

        for page_num, raw_img in enumerate(images, start=1):
            # Try robust preprocessing first
            try:
                processed_img = preprocess_image(raw_img, page_num)
                page_text = pytesseract.image_to_string(
                    processed_img, config=tesseract_config
                )
            except Exception as e:
                logger.warning(
                    "Page %d: Preprocessing failed: %s", page_num, e
                )
                page_text = ""

            # Fall back to the raw image if preprocessing failed or
            # produced (almost) no text.
            if len(page_text.strip()) < 10:
                logger.warning(
                    "Page %d: Low confidence. Retrying with raw image.",
                    page_num,
                )
                page_text = pytesseract.image_to_string(
                    raw_img, config=tesseract_config
                )

            page_blocks.append(f"--- Page {page_num} ---\n{page_text}\n")
    except Exception as e:
        logger.error("OCR Pipeline Error: %s", e)
        return f"Error processing file: {str(e)}"
    finally:
        # Image.open keeps the file handle open until the image is closed;
        # closing also releases the memory held by rendered PDF pages.
        for img in images:
            img.close()

    return "".join(page_blocks).strip()