File size: 3,088 Bytes
dc79584
 
 
 
 
006541d
 
 
 
 
 
 
 
 
 
dc79584
006541d
dc79584
 
006541d
 
 
 
 
dc79584
 
 
 
 
006541d
 
dc79584
006541d
 
dc79584
 
006541d
dc79584
 
 
 
006541d
dc79584
 
 
 
 
006541d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc79584
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import os
import logging

# Module-level logger. It is created before the optional import below so the
# fallback warning goes through this named logger instead of the root logger
# (calling logging.warning() directly would implicitly run basicConfig() on an
# unconfigured root logger as an import-time side effect).
logger = logging.getLogger("ocr_engine")

# Import the robust vision logic.
# ocr_preprocessing_engine.py must be importable, e.g. located in the same
# directory as this module.
try:
    from ocr_preprocessing_engine import preprocess_image
except ImportError as exc:
    # Fail-safe: the module (or something it imports) is unavailable, so fall
    # back to an identity function and let callers OCR the raw image.
    logger.warning(
        "ocr_preprocessing_engine not available (%s). Using raw OCR only.", exc
    )

    def preprocess_image(img, page_num):
        """Return ``img`` unchanged; stand-in used when the real engine is missing.

        ``page_num`` is accepted only to mirror the call signature used by
        this module and is ignored.
        """
        return img

# Tesseract options applied to every page:
#   --oem 3: default engine mode (Tesseract chooses based on what is available)
#   --psm 4: assume a single column of text of variable sizes
_TESSERACT_CONFIG = r'--oem 3 --psm 4'

# A page whose stripped OCR text is shorter than this is treated as a failed
# extraction and retried on the raw (un-preprocessed) image.
_MIN_PAGE_TEXT_CHARS = 10

# File extensions (lower-case) that are opened directly as single-page images.
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp')


def _ocr_page(raw_img: "Image.Image", page_num: int) -> str:
    """OCR one page: preprocessed image first, raw image if that yields too little text."""
    page_text = ""
    # True only when the first OCR call succeeded AND ran on the raw image
    # object itself (e.g. the identity fallback for preprocess_image). In that
    # case a "raw retry" would repeat the exact same call, so it is skipped.
    ocr_ran_on_raw = False

    # --- STRATEGY A: ROBUST PREPROCESSING ---
    try:
        processed_img = preprocess_image(raw_img, page_num)
        page_text = pytesseract.image_to_string(processed_img, config=_TESSERACT_CONFIG)
        ocr_ran_on_raw = processed_img is raw_img
    except Exception as e:
        # Deliberate best-effort: any preprocessing/OCR failure falls through
        # to the raw-image attempt below.
        logger.warning(
            "Page %s: Preprocessing failed (%s). Skipping to fallback.", page_num, e
        )

    # --- STRATEGY B: FALLBACK MECHANISM ---
    # If preprocessing was too aggressive (e.g. thresholding wiped the text),
    # let Tesseract binarize the untouched image itself.
    if len(page_text.strip()) < _MIN_PAGE_TEXT_CHARS and not ocr_ran_on_raw:
        logger.info(
            "Page %s: Low confidence extraction. Retrying with raw image...", page_num
        )
        page_text = pytesseract.image_to_string(raw_img, config=_TESSERACT_CONFIG)

    return page_text


def _close_images(images) -> None:
    """Best-effort close of every loaded PIL image so file handles are released."""
    for img in images:
        try:
            img.close()
        except Exception as e:
            # Cleanup must never mask the real result or error of the OCR run.
            logger.debug("Failed to close image: %s", e)


def extract_text_from_file(file_path: str) -> str:
    """
    Extract text from a PDF or image file using a hybrid OCR pipeline.

    For each page:
    1. OCR the output of ``preprocess_image`` (robust preprocessing).
    2. If that fails or produces fewer than ``_MIN_PAGE_TEXT_CHARS``
       non-whitespace-trimmed characters, OCR the raw page image instead.

    PDFs are rasterized at 300 DPI; image files are opened as a single page.
    Supported extensions (case-insensitive): .pdf, .png, .jpg, .jpeg, .tiff, .bmp.

    Args:
        file_path: Path to the PDF or image to read.

    Returns:
        The concatenated page texts, each preceded by a ``--- Page N ---``
        header, with leading/trailing whitespace stripped.
        Failures are reported through the return value, never raised:
        ``""`` if the path does not exist, and a human-readable message
        starting with ``"Error reading PDF: "``, ``"Error reading image: "``,
        ``"Unsupported file format."`` or ``"OCR Failed: "`` otherwise.
    """
    if not os.path.exists(file_path):
        return ""

    lowered_path = file_path.lower()
    images = []

    try:
        # 1. Image loading & DPI scaling (300 DPI rasterization for PDFs).
        if lowered_path.endswith('.pdf'):
            try:
                images = convert_from_path(file_path, dpi=300)
            except Exception as e:
                return f"Error reading PDF: {str(e)}"
        elif lowered_path.endswith(_IMAGE_EXTENSIONS):
            try:
                images = [Image.open(file_path)]
            except Exception as e:
                return f"Error reading image: {str(e)}"
        else:
            return "Unsupported file format. Please upload PDF or Image."

        # 2. Page-by-page extraction (pages are numbered from 1).
        page_chunks = [
            f"--- Page {page_num} ---\n{_ocr_page(raw_img, page_num)}\n"
            for page_num, raw_img in enumerate(images, start=1)
        ]
    except Exception as e:
        # Top-level boundary: log with traceback, report via the return value.
        logger.exception("OCR Critical Error: %s", e)
        return f"OCR Failed: {str(e)}"
    finally:
        # Image.open keeps the underlying file open until the image is closed;
        # release every page regardless of how the function exits.
        _close_images(images)

    return "".join(page_chunks).strip()