File size: 9,740 Bytes
ae15dbe
d80fc17
 
 
 
ae15dbe
 
 
 
 
 
 
 
 
d80fc17
ae15dbe
9053089
ae15dbe
d80fc17
ae15dbe
d80fc17
 
ae15dbe
 
 
 
 
 
d80fc17
 
ae15dbe
 
 
d80fc17
 
ae15dbe
d80fc17
 
 
ae15dbe
 
d80fc17
 
ae15dbe
 
 
 
 
d80fc17
ae15dbe
d80fc17
ae15dbe
d80fc17
 
 
ae15dbe
 
 
d80fc17
 
ae15dbe
d80fc17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6388f1a
d80fc17
6388f1a
 
 
 
 
 
 
 
 
d80fc17
6388f1a
d80fc17
 
 
 
 
 
6388f1a
d80fc17
 
 
 
 
 
 
 
 
 
 
49f1d84
 
 
 
 
 
 
 
 
 
 
ae15dbe
d80fc17
49f1d84
 
 
 
d80fc17
 
ae15dbe
d80fc17
ae15dbe
 
 
2f44e71
ae15dbe
 
 
d80fc17
 
ae15dbe
 
cbca9e2
d80fc17
 
ae15dbe
 
2f44e71
ae15dbe
d80fc17
27b73f9
 
d80fc17
 
ae15dbe
d80fc17
ae15dbe
 
 
d80fc17
ae15dbe
d80fc17
ae15dbe
 
d80fc17
ae15dbe
9053089
d80fc17
ae15dbe
d80fc17
 
ae15dbe
d80fc17
 
 
ae15dbe
d80fc17
 
ae15dbe
 
 
 
d80fc17
 
ae15dbe
27b73f9
d80fc17
27b73f9
d80fc17
27b73f9
d80fc17
cbca9e2
 
 
 
 
 
 
 
 
 
 
 
 
 
ae15dbe
 
 
f65b001
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
"""
OCR Text Detection App
======================
Gradio app for OCR text extraction
Features: File upload, URL upload, demo images, confidence filtering
"""

import cv2
import easyocr
import numpy as np
from pathlib import Path
import gradio as gr
import urllib.request
import tempfile
import os
import warnings
from PIL import Image

warnings.filterwarnings('ignore')

# Initialize EasyOCR reader once (reused for all images)
reader = easyocr.Reader(['en'], gpu=False, verbose=False)


def format_text_aligned(results):
    """Reconstruct reading order from OCR detections.

    Groups detections with similar Y-centers into visual lines (top to
    bottom) and orders words within each line by X (left to right).

    Args:
        results: iterable of (bbox, text, confidence) triples as produced by
            easyocr's readtext, where bbox is a list of (x, y) corner points.

    Returns:
        str: one string per visual line, joined with newlines; "" when empty.
    """
    if not results:
        return ""

    # For each detection keep (y_center, x_min, box_height, text).
    detections = []
    for bbox, text, _ in results:
        xs = [p[0] for p in bbox]
        ys = [p[1] for p in bbox]
        detections.append((sum(ys) / len(ys), min(xs), max(ys) - min(ys), text))

    # Same-line grouping threshold: 30% of the average spacing between
    # distinct integer Y bands.
    y_coords = [d[0] for d in detections]
    spacing_threshold = (max(y_coords) - min(y_coords)) / len(set(int(y) for y in y_coords)) * 0.3
    # BUG FIX: for a single line of text the Y spread is ~0, which collapsed
    # the threshold to ~0 and split same-line words onto separate lines.
    # Fall back to half the median box height so near-equal Y values group.
    heights = sorted(d[2] for d in detections)
    median_height = heights[len(heights) // 2]
    y_threshold = max(spacing_threshold, median_height * 0.5)

    # Sort by Y (top to bottom), then X (left to right).
    detections.sort(key=lambda d: (d[0], d[1]))

    # Sweep top to bottom, starting a new line whenever the Y-center drifts
    # beyond the threshold from the current line's anchor Y.
    lines = []
    current_line = [(detections[0][1], detections[0][3])]
    current_y = detections[0][0]
    for y, x, _height, text in detections[1:]:
        if abs(y - current_y) <= y_threshold:
            current_line.append((x, text))
        else:
            lines.append(' '.join(t for _, t in sorted(current_line, key=lambda it: it[0])))
            current_line, current_y = [(x, text)], y
    lines.append(' '.join(t for _, t in sorted(current_line, key=lambda it: it[0])))
    return '\n'.join(lines)


def process_ocr(input_image, confidence_threshold=0.0):
    """Run OCR on an image and return (annotated image, formatted text).

    Args:
        input_image: PIL Image or RGB numpy array; None is tolerated.
        confidence_threshold: minimum confidence for a detection to be kept.

    Returns:
        tuple: (PIL Image with boxes and labels drawn, extracted text str).
    """
    if input_image is None:
        return None, ""
    
    # Normalize input to a 3-channel RGB numpy array before converting to BGR.
    # BUG FIX: RGBA (PNG with alpha), palette, or grayscale inputs previously
    # crashed cv2.cvtColor, which expects a 3-channel RGB array here.
    if isinstance(input_image, Image.Image):
        input_image = np.array(input_image.convert('RGB'))
    elif input_image.ndim == 2:
        input_image = cv2.cvtColor(input_image, cv2.COLOR_GRAY2RGB)
    elif input_image.shape[2] == 4:
        input_image = cv2.cvtColor(input_image, cv2.COLOR_RGBA2RGB)
    image_bgr = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR)
    
    # Perform OCR with the module-level EasyOCR reader.
    results = reader.readtext(image_bgr)
    
    # Keep only detections at or above the confidence threshold.
    filtered_results = [(bbox, text, conf) for bbox, text, conf in results if conf >= confidence_threshold]
    formatted_text = format_text_aligned(filtered_results)
    
    # Draw bounding boxes and labels on a copy of the image.
    annotated_image = image_bgr.copy()
    for bbox, text, confidence in filtered_results:
        # Draw bounding box polygon (EasyOCR returns four corner points).
        bbox_points = np.array([[int(p[0]), int(p[1])] for p in bbox], dtype=np.int32)
        cv2.polylines(annotated_image, [bbox_points], isClosed=True, color=(0, 255, 0), thickness=2)
        
        # Axis-aligned extents of the box, used for label placement and sizing.
        x_min, y_min = int(min(p[0] for p in bbox)), int(min(p[1] for p in bbox))
        x_max, y_max = int(max(p[0] for p in bbox)), int(max(p[1] for p in bbox))
        bbox_width = x_max - x_min
        bbox_height = y_max - y_min
        
        # Scale font size with the smaller box dimension, clamped to a sane range.
        font_scale = min(bbox_width, bbox_height) / 100.0
        font_scale = max(0.3, min(font_scale, 1.5))  # Clamp between 0.3 and 1.5
        thickness = max(1, int(font_scale * 2))
        
        label = f"{text} ({confidence:.2f})"
        (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness)
        
        # Place the label above the box unless it would fall off the top edge.
        text_y = y_min - 5 if y_min > 20 else y_min + 20
        
        # Filled background rectangle behind the label for readability.
        cv2.rectangle(annotated_image, (x_min - 2, text_y - h - 2), (x_min + w + 2, text_y + 2), (0, 255, 0), -1)
        cv2.putText(annotated_image, label, (x_min, text_y), cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0, 0, 0), thickness)
    
    # Convert back to RGB and wrap in a PIL Image for Gradio 5.x compatibility.
    output_rgb = cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
    output_pil = Image.fromarray(output_rgb)
    return output_pil, formatted_text or ""


# Load up to three sample images for the demo gallery.
# BUG FIX: Path('images').iterdir() raised FileNotFoundError at import time
# when the directory was missing; tolerate that so the app still starts.
exts = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif')
_images_dir = Path('images')
if _images_dir.is_dir():
    sample_images = sorted(str(f) for f in _images_dir.iterdir() if f.suffix.lower() in exts)[:3]
else:
    sample_images = []

# CSS for professional styling.
# Injected into gr.Blocks below; hides scrollbars, constrains the page width,
# and styles the header/description/credits markdown blocks.
css = """
.gradio-container {font-family: 'Segoe UI', sans-serif; max-width: 1400px; margin: 0 auto; overflow-x: hidden;}
body, html {overflow-x: hidden; scrollbar-width: none;}
::-webkit-scrollbar {display: none;}
h1 {text-align: center; color: #042AFF; margin-bottom: 1rem; font-size: 2.5rem; font-weight: bold; letter-spacing: -0.5px;}
.description {text-align: center; color: #6b7280; margin-bottom: 0.3rem; font-size: 1.05rem; line-height: 1.6;}
.credits {text-align: center; color: #f2faf4; margin-bottom: 2rem; margin-top: 0; font-size: 1rem;}
.credits a {color: #042AFF; text-decoration: none; font-weight: bold; transition: color 0.3s ease;}
.credits a:hover {color: #111F68; text-decoration: underline;}
"""

# Create Gradio interface
# Create Gradio interface.
# NOTE: component creation order inside the context managers defines the
# rendered layout, so statements here must not be reordered.
with gr.Blocks(title="OCR Text Detection", theme=gr.themes.Soft(), css=css) as demo:
    # Page header: title, description, and attribution line.
    gr.Markdown("# 📄 OCR Text Detection")
    gr.Markdown("<div class='description'>Extract text from images with bounding boxes and confidence scores. Upload an image or select a demo image to get started.</div>", elem_classes=["description"])
    gr.Markdown("<div class='credits' style='text-align: center;'>Made by <a href='https://techtics.ai' target='_blank' style='color: #042AFF; text-decoration: none; font-weight: bold;'>Techtics.ai</a></div>", elem_classes=["credits"])
    
    # Main layout: Two columns
    with gr.Row():
        # Column 1: Upload area with tabs (direct file upload or URL fetch)
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.Tab("Upload File"):
                    image_input = gr.Image(label="Upload Image", height=400, show_download_button=True, show_share_button=False, show_fullscreen_button=True)
                with gr.Tab("Image by URL"):
                    url_input = gr.Textbox(label="Image URL", placeholder="Enter image URL (jpg, png, etc.)", lines=1)
                    url_btn = gr.Button("Load Image from URL", variant="primary")
            
            # Demo images gallery
            # Only rendered when sample files were found in the ./images directory.
            if sample_images:
                gr.Markdown("### Demo Images (Click to load)")
                demo_gallery = gr.Gallery(value=sample_images, columns=3, rows=1, height=210, show_label=False, container=True, preview=False, object_fit="scale-down")
        
        # Column 2: Processed image and confidence slider
        with gr.Column(scale=1):
            gr.Markdown("### Processed Image")
            annotated_output = gr.Image(label="", height=400, visible=True, show_download_button=True, show_share_button=False, show_fullscreen_button=True)
            confidence_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.05, label="Confidence Threshold", info="Filter detections by minimum confidence score")
    
    # Text output below both columns (full width, hidden until processing)
    text_output = gr.Textbox(label="Extracted Text", value="", placeholder="Extracted text will appear here after processing...", lines=12, interactive=True, show_copy_button=True, visible=False)
    
    # Load image from URL
    def load_from_url(url):
        """Download an image from *url* and return it as a PIL Image.

        Returns None for empty input or any download/decode failure so the
        Gradio image component is simply cleared instead of raising.
        """
        if not url or not url.strip():
            return None
        import io  # local import: only needed by this handler
        try:
            req = urllib.request.Request(url.strip(), headers={'User-Agent': 'Mozilla/5.0'})
            with urllib.request.urlopen(req, timeout=10) as response:
                data = response.read()
            # BUG FIX: the previous temp-file approach unlinked the file while
            # PIL's lazy loader could still need it (and os.unlink fails on
            # Windows while the handle is open, silently returning None).
            # Decode from memory and force a full load instead.
            img = Image.open(io.BytesIO(data))
            img.load()
            return img
        except Exception:
            # Best-effort: any network or decode error just clears the input.
            return None
    
    # Resolve a gallery click to the corresponding demo image.
    def load_from_gallery(evt: gr.SelectData):
        """Open and return the demo image selected in the gallery, or None."""
        idx = evt.index
        if idx >= len(sample_images):
            # Stale or out-of-range selection: clear the input instead.
            return None
        return Image.open(sample_images[idx])
    
    # Event handlers
    # Both clicking the button and pressing Enter in the URL box load the image.
    url_btn.click(fn=load_from_url, inputs=url_input, outputs=image_input)
    url_input.submit(fn=load_from_url, inputs=url_input, outputs=image_input)
    if sample_images:
        # demo_gallery only exists when sample_images is non-empty (see layout).
        demo_gallery.select(fn=load_from_gallery, outputs=image_input)
    
    # Re-run OCR whenever the image or the confidence threshold changes.
    def on_change(img, conf_thresh):
        """Return the annotated image plus a visibility update for the text box."""
        if img is None:
            # No image loaded: clear the output and keep the text box hidden.
            return None, gr.update(visible=False, value="")
        annotated, extracted = process_ocr(img, conf_thresh)
        return annotated, gr.update(visible=True, value=extracted or "")
    
    # Process image when it changes - store event to allow cancellation
    # (the returned event handle is passed to cancels= below).
    process_event = image_input.change(
        fn=on_change, 
        inputs=[image_input, confidence_slider], 
        outputs=[annotated_output, text_output]
    )
    
    # Confidence slider cancels previous processing to avoid queue buildup
    # when the user drags the slider faster than OCR completes.
    confidence_slider.change(
        fn=on_change, 
        inputs=[image_input, confidence_slider], 
        outputs=[annotated_output, text_output],
        cancels=[process_event]  # Cancel previous image processing
    )


if __name__ == "__main__":
    # Launch the Gradio app with default settings (local server only).
    demo.launch()
    # For local testing, use: demo.launch(share=True, server_name="0.0.0.0", server_port=7860)