File size: 3,870 Bytes
e158d2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""
Text extraction from PDFs and images using EasyOCR
Smart extraction: tries text layer first, falls back to OCR
"""

import fitz  # PyMuPDF
import easyocr
from PIL import Image
from pdf2image import convert_from_bytes
import io
import numpy as np
from typing import Tuple, Optional

# Build a single module-level EasyOCR reader at import time. The OCR
# functions in this module read this shared `reader` name.
print("Initializing EasyOCR Reader...")
try:
    # Language list is ['en']; GPU use and verbose output are switched off.
    reader = easyocr.Reader(['en'], gpu=False, verbose=False)
    print("βœ“ EasyOCR Reader initialized successfully")
except Exception as e:
    print(f"βœ— EasyOCR initialization failed: {e}")
    # Falsy sentinel: the functions below test `if not reader` to detect
    # that initialization did not succeed.
    reader = None

def extract_text_from_pdf(pdf_bytes: bytes, min_text_chars: int = 50) -> Tuple[Optional[str], bool]:
    """
    Extract text from PDF with smart OCR fallback

    The embedded text layer is read first. If its stripped length does not
    exceed ``min_text_chars``, the PDF is handed to
    ``extract_text_from_pdf_via_ocr`` instead.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        min_text_chars: The stripped text layer must be longer than this
            many characters to be accepted without OCR (default 50).

    Returns:
        (extracted_text, ocr_used)

        * ``(text, False)`` when the text layer was accepted.
        * ``(text_or_None, True)`` when OCR was attempted; the text is
          whatever the OCR helper returned (``None`` if it failed).
        * ``(None, False)`` for empty input or when any exception is raised
          while processing (including the OCR helper's ``RuntimeError``).
    """
    if not pdf_bytes:
        return None, False

    try:
        # Try extracting text layer first (fast)
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            full_text = "".join(page.get_text() for page in doc)
        finally:
            # Close the document even if reading a page raises.
            doc.close()

        # Check if meaningful text was extracted
        if len(full_text.strip()) > min_text_chars:
            print(f"βœ“ Extracted {len(full_text)} chars from text layer")
            return full_text.strip(), False

        # No usable text layer - use OCR
        print("⚠ No text layer detected, using EasyOCR...")
        text = extract_text_from_pdf_via_ocr(pdf_bytes)
        return text, True

    except Exception as e:
        # Best-effort API: report the failure and signal it with None.
        print(f"βœ— Error in PDF text extraction: {e}")
        return None, False

def extract_text_from_pdf_via_ocr(pdf_bytes: bytes, dpi: int = 300) -> Optional[str]:
    """
    Extract text using EasyOCR on PDF pages converted to images

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Resolution passed to ``convert_from_bytes`` when rendering the
            pages to images (default 300).

    Returns:
        The recognized text of all pages, each page followed by a blank
        line, with leading/trailing whitespace stripped; ``None`` if page
        conversion or OCR raises.

    Raises:
        RuntimeError: If the module-level EasyOCR ``reader`` is not available.
    """
    if not reader:
        raise RuntimeError("EasyOCR not initialized")

    try:
        # Convert PDF to images
        images = convert_from_bytes(pdf_bytes, dpi=dpi)
        page_count = len(images)
        page_texts = []

        for page_number, image in enumerate(images, start=1):
            print(f"   OCR processing page {page_number}/{page_count}...")

            # EasyOCR is given the page as a numpy array built from the PIL image
            results = reader.readtext(np.array(image), detail=0, paragraph=True)
            page_texts.append(' '.join(results))

        # Every page (including the last) is followed by a blank line, so the
        # reported character count is taken before stripping.
        full_text = "".join(f"{page_text}\n\n" for page_text in page_texts)

        print(f"βœ“ EasyOCR extracted {len(full_text)} chars from {page_count} pages")
        return full_text.strip()

    except Exception as e:
        # Best-effort API: report the failure and signal it with None.
        print(f"βœ— OCR failed: {e}")
        return None

def extract_text_from_image(image_bytes: bytes) -> Optional[str]:
    """
    Run EasyOCR on an encoded image and return the recognized text

    Args:
        image_bytes: Raw bytes of an image file that PIL can open.

    Returns:
        The recognized text fragments joined with single spaces and
        stripped, or ``None`` if decoding or OCR raises.

    Raises:
        RuntimeError: If the module-level EasyOCR ``reader`` is not available.
    """
    if not reader:
        raise RuntimeError("EasyOCR not initialized")

    try:
        print("Processing image with EasyOCR...")

        # Decode the bytes and make sure the pixel data is in RGB mode
        picture = Image.open(io.BytesIO(image_bytes))
        rgb_picture = picture if picture.mode == 'RGB' else picture.convert('RGB')

        # OCR works on the numpy representation of the image
        fragments = reader.readtext(np.array(rgb_picture), detail=0, paragraph=True)
        text = ' '.join(fragments)

        print(f"βœ“ EasyOCR extracted {len(text)} chars from image")
        return text.strip()

    except Exception as e:
        print(f"βœ— Image OCR failed: {e}")
        return None

def get_ocr_confidence(image_array: np.ndarray) -> list:
    """
    Get detailed OCR results with confidence scores

    Args:
        image_array: Image data passed unchanged to ``reader.readtext``.

    Returns:
        A list of dicts, one per detection, with keys ``"text"``,
        ``"confidence"`` (rounded to 3 decimal places) and ``"bbox"``.
        An empty list is returned when the module-level ``reader`` is not
        available or when OCR raises.
    """
    if not reader:
        return []

    try:
        # detail=1 yields (bbox, text, confidence) triples
        results = reader.readtext(image_array, detail=1)
        return [
            {
                "text": text,
                "confidence": round(conf, 3),
                "bbox": bbox
            }
            for bbox, text, conf in results
        ]
    except Exception as e:
        # Stay best-effort, but do not swallow KeyboardInterrupt/SystemExit
        # and do not hide the reason for the empty result.
        print(f"OCR confidence extraction failed: {e}")
        return []