File size: 5,701 Bytes
9012453
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""
OCR module adapted for HuggingFace Spaces.
Uses Google Cloud Vision API for text detection.
"""

from PIL import Image, ImageDraw, ImageFilter
from google.cloud import vision
import numpy as np
import io
import os
import json
import tempfile
from py_files.bounding_clustering import QuadTree, Node


def change_contrast(img, level):
    """Remap pixel values around the midpoint to adjust image contrast.

    Uses the standard contrast-correction factor; positive ``level``
    increases contrast, negative decreases it. Returns a new image.
    """
    factor = (259 * (level + 255)) / (255 * (259 - level))
    return img.point(lambda c: 128 + factor * (c - 128))


def get_bounding_box_doc(blk):
    """Return [x0, y0, x1, y1] for a Vision text element.

    Uses vertices 0 and 2 of the element's bounding box (opposite
    corners), coerced to int.
    """
    top_left = blk.bounding_box.vertices[0]
    bottom_right = blk.bounding_box.vertices[2]
    return [int(top_left.x), int(top_left.y),
            int(bottom_right.x), int(bottom_right.y)]


def _vision_client_from_env():
    """Create a Vision client, materializing credentials if provided.

    If the GOOGLE_CLOUD_CREDENTIALS env var holds a JSON credential
    blob, it is written to a temporary file and exported via
    GOOGLE_APPLICATION_CREDENTIALS before the client is built.

    Returns:
        (client, creds_path) where creds_path is the temporary file the
        caller must delete, or None if no env credentials were used.
    """
    creds_path = None
    google_creds = os.environ.get('GOOGLE_CLOUD_CREDENTIALS')
    if google_creds:
        creds_data = json.loads(google_creds)
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
            json.dump(creds_data, f)
            creds_path = f.name
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path
    return vision.ImageAnnotatorClient(), creds_path


def get_text_from_image_doc(img, debug=False, get_response=False, resp=None, max_dist=20):
    """
    Extract text from an image using Google Cloud Vision document text
    detection, then spatially cluster the detected blocks.

    Adapted for the HuggingFace Spaces environment: credentials come
    from an env var and any API failure falls back to a dummy response
    so the demo still runs.

    Args:
        img: PIL image to OCR (contrast-enhanced before upload).
        debug: If True, print per-block confidence/bbox/word diagnostics.
        get_response: If True, also return the raw Vision response.
        resp: Pre-fetched Vision response; skips the API call when given.
        max_dist: Max pixel distance used by QuadTree clustering.

    Returns:
        (QuadTree, response) if get_response is True, else (QuadTree, {}).
    """
    response = resp
    if resp is None:
        creds_path = None
        try:
            client, creds_path = _vision_client_from_env()

            # Enhance contrast for better OCR results.
            img = change_contrast(img, 20)

            # Serialize the PIL image to PNG bytes for the API.
            img_bytes = io.BytesIO()
            img.save(img_bytes, format='PNG')
            image = vision.Image(content=img_bytes.getvalue())

            response = client.document_text_detection(image=image)
        except Exception as e:
            # Fallback: dummy response so the demo works without
            # Google Cloud Vision credentials.
            print(f"Warning: Google Cloud Vision not available: {e}")
            response = create_dummy_ocr_response(img)
        finally:
            # Always remove the temporary credentials file, even when
            # the API call above failed (the original leaked it then).
            if creds_path is not None:
                try:
                    os.unlink(creds_path)
                except OSError:
                    pass

    # One [text, x0, y0, x1, y1, avg_font_height] entry per text block.
    word_boxes = []

    if hasattr(response, 'full_text_annotation') and response.full_text_annotation:
        for page in response.full_text_annotation.pages:
            for block in page.blocks:
                # Skip low-confidence blocks to reduce OCR noise.
                if block.confidence < 0.9:
                    continue
                if debug:
                    print(f"\nBlock confidence: {block.confidence}")
                    print(f"Block box: {get_bounding_box_doc(block)}")

                words = ""
                fonts = []
                for paragraph in block.paragraphs:
                    for word in paragraph.words:
                        word_text = "".join(symbol.text for symbol in word.symbols)
                        words += word_text + " "
                        word_bbox = get_bounding_box_doc(word)
                        # Word height approximates the font size.
                        fonts.append(abs(word_bbox[3] - word_bbox[1]))

                if debug:
                    print(f"Words: {words}")

                if fonts:  # Only add blocks that contained at least one word
                    word_boxes.append([words.strip()] + get_bounding_box_doc(block)
                                      + [sum(fonts) // len(fonts)])

    # Guarantee at least one entry so downstream code never sees an
    # empty clustering tree.
    if not word_boxes:
        word_boxes.append(["No text detected", 0, 0, 100, 20, 12])

    # Cluster nearby text boxes with a quadtree.
    tree = QuadTree(max_dist=max_dist)
    for box in word_boxes:
        tree.insert(Node(*box))

    if get_response:
        return tree, response
    return tree, {}


def create_dummy_ocr_response(img):
    """
    Create a dummy OCR response for demo purposes when Google Cloud
    Vision is not available, letting the demo run without credentials.

    Args:
        img: The image that would have been OCR'd (unused; kept for
            interface compatibility with the real detection path).

    Returns:
        An object mimicking a Vision response whose
        ``full_text_annotation`` is None (i.e. no text detected).
    """
    # NOTE: the original unpacked img.size into unused locals; dropped
    # so any argument (even None) is accepted.
    class MockResponse:
        def __init__(self):
            # Matches the attribute the caller inspects via hasattr().
            self.full_text_annotation = None

    # In a real scenario an alternative OCR library such as pytesseract
    # could populate this instead.
    return MockResponse()


def draw_boxes(img, bound, color, width=5):
    """Draw a padded rectangle around ``bound`` on a copy of ``img``.

    ``bound`` supplies two opposite corners in (x0, y0, x1, y1) order;
    the rectangle is padded by 7px on the top/left and 10px on the
    bottom/right.

    Returns:
        (annotated_image, x0, y0, x1, y1) — the copy with the box drawn
        plus the final rectangle coordinates.
    """
    annotated = img.copy()

    left, right = sorted((bound[0], bound[2]))
    top, bottom = sorted((bound[1], bound[3]))
    left, top = left - 7, top - 7
    right, bottom = right + 10, bottom + 10

    ImageDraw.Draw(annotated).rectangle(
        [left, top, right, bottom], outline=color, width=width)
    return annotated, left, top, right, bottom


def get_image_with_boxes_doc(image, color='red', width=5, get_response=False, response=None):
    """Run OCR on ``image`` and draw a box around each clustered text region.

    Returns (image, response) when ``get_response`` is True, otherwise
    just the annotated image.
    """
    tree, resp = get_text_from_image_doc(image, get_response=get_response, resp=response)
    for bound in tree.get_children(False):
        image, *_ = draw_boxes(image, bound, color, width)
    return (image, resp) if get_response else image