Spaces:
Runtime error
Runtime error
File size: 9,740 Bytes
ae15dbe d80fc17 ae15dbe d80fc17 ae15dbe 9053089 ae15dbe d80fc17 ae15dbe d80fc17 ae15dbe d80fc17 ae15dbe d80fc17 ae15dbe d80fc17 ae15dbe d80fc17 ae15dbe d80fc17 ae15dbe d80fc17 ae15dbe d80fc17 ae15dbe d80fc17 ae15dbe d80fc17 6388f1a d80fc17 6388f1a d80fc17 6388f1a d80fc17 6388f1a d80fc17 49f1d84 ae15dbe d80fc17 49f1d84 d80fc17 ae15dbe d80fc17 ae15dbe 2f44e71 ae15dbe d80fc17 ae15dbe cbca9e2 d80fc17 ae15dbe 2f44e71 ae15dbe d80fc17 27b73f9 d80fc17 ae15dbe d80fc17 ae15dbe d80fc17 ae15dbe d80fc17 ae15dbe d80fc17 ae15dbe 9053089 d80fc17 ae15dbe d80fc17 ae15dbe d80fc17 ae15dbe d80fc17 ae15dbe d80fc17 ae15dbe 27b73f9 d80fc17 27b73f9 d80fc17 27b73f9 d80fc17 cbca9e2 ae15dbe f65b001 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 | """
OCR Text Detection App
======================
Gradio app for OCR text extraction
Features: File upload, URL upload, demo images, confidence filtering
"""
import cv2
import easyocr
import numpy as np
from pathlib import Path
import gradio as gr
import urllib.request
import tempfile
import os
import warnings
from PIL import Image
# Turn off every Python warning for the whole process.
warnings.simplefilter('ignore')

# One EasyOCR reader (English, CPU, non-verbose) is built at import time and
# shared by every OCR request instead of being re-created per image.
reader = easyocr.Reader(
    ['en'],
    gpu=False,
    verbose=False,
)
def format_text_aligned(results):
    """Arrange OCR detections into reading order, one output line per text row.

    A detection joins the current row when its vertical centre lies within
    half of the median bounding-box height of the row's first detection.
    Rows are emitted top to bottom; words inside a row are ordered left to
    right by the left edge of their box.

    Args:
        results: Sequence of ``(bbox, text, confidence)`` tuples, where
            ``bbox`` is a sequence of ``(x, y)`` corner points. The
            confidence value is ignored here.

    Returns:
        The texts joined by single spaces within a row and by newlines
        between rows, or ``""`` when there are no detections.
    """
    if not results:
        return ""

    # Reduce every detection to (y_centre, x_left, text) and remember its height.
    detections = []
    heights = []
    for bbox, text, _ in results:
        ys = [point[1] for point in bbox]
        x_left = min(point[0] for point in bbox)
        detections.append((sum(ys) / len(ys), x_left, text))
        heights.append(max(ys) - min(ys))
    if not detections:
        return ""

    # Tolerance is tied to the glyph size rather than to the overall vertical
    # spread: a spread-based value shrinks toward zero as more detections are
    # added, which splits words of one line that sit a pixel or two apart.
    heights.sort()
    y_threshold = 0.5 * heights[len(heights) // 2]

    # Top to bottom, then left to right.
    detections.sort(key=lambda d: (d[0], d[1]))

    lines = []
    row = []
    row_y = detections[0][0]
    for y, x, text in detections:
        if row and abs(y - row_y) > y_threshold:
            # Vertical gap too large: flush the finished row.
            lines.append(' '.join(t for _, t in sorted(row, key=lambda item: item[0])))
            row = []
        if not row:
            # The first detection of a row is the reference for the rest.
            row_y = y
        row.append((x, text))
    if row:
        lines.append(' '.join(t for _, t in sorted(row, key=lambda item: item[0])))

    return '\n'.join(lines)
def _to_bgr(image):
    """Return *image* (PIL image or array) as a 3-channel BGR array."""
    if isinstance(image, Image.Image):
        # Normalise palette / grayscale / alpha modes before leaving PIL.
        image = np.array(image.convert("RGB"))
    else:
        image = np.asarray(image)
    # COLOR_RGB2BGR only accepts 3-channel input, so pick the conversion that
    # matches the actual channel layout instead of assuming RGB.
    if image.ndim == 2 or image.shape[2] == 1:
        return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    if image.shape[2] == 4:
        return cv2.cvtColor(image, cv2.COLOR_RGBA2BGR)
    return cv2.cvtColor(image, cv2.COLOR_RGB2BGR)


def _draw_detection(canvas, bbox, text, confidence):
    """Draw one detection's outline and its ``text (confidence)`` label on *canvas*."""
    # Outline the detected region.
    bbox_points = np.array([[int(p[0]), int(p[1])] for p in bbox], dtype=np.int32)
    cv2.polylines(canvas, [bbox_points], isClosed=True, color=(0, 255, 0), thickness=2)

    x_min, y_min = int(min(p[0] for p in bbox)), int(min(p[1] for p in bbox))
    x_max, y_max = int(max(p[0] for p in bbox)), int(max(p[1] for p in bbox))

    # Label size follows the smaller box side, clamped to the range 0.3-1.5.
    font_scale = min(x_max - x_min, y_max - y_min) / 100.0
    font_scale = max(0.3, min(font_scale, 1.5))
    thickness = max(1, int(font_scale * 2))

    label = f"{text} ({confidence:.2f})"
    (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness)

    # Put the label above the box unless the box is within 20 px of the top edge.
    text_y = y_min - 5 if y_min > 20 else y_min + 20

    # Filled green background, then black text on top of it.
    cv2.rectangle(canvas, (x_min - 2, text_y - h - 2), (x_min + w + 2, text_y + 2), (0, 255, 0), -1)
    cv2.putText(canvas, label, (x_min, text_y), cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0, 0, 0), thickness)


def process_ocr(input_image, confidence_threshold=0.0):
    """Run OCR on an image and return the annotated image plus the formatted text.

    Args:
        input_image: PIL image or numpy array (grayscale, RGB or RGBA), or
            ``None``.
        confidence_threshold: Detections with a confidence below this value
            are dropped from both the drawing and the text.

    Returns:
        ``(annotated_image, text)`` where ``annotated_image`` is a PIL image
        with boxes and labels drawn on it and ``text`` is the output of
        ``format_text_aligned``. Returns ``(None, "")`` when ``input_image``
        is ``None``.
    """
    if input_image is None:
        return None, ""

    image_bgr = _to_bgr(input_image)

    # Run the module-level EasyOCR reader, then keep confident detections only.
    results = reader.readtext(image_bgr)
    filtered_results = [
        (bbox, text, conf) for bbox, text, conf in results if conf >= confidence_threshold
    ]
    formatted_text = format_text_aligned(filtered_results)

    annotated_image = image_bgr.copy()
    for bbox, text, confidence in filtered_results:
        _draw_detection(annotated_image, bbox, text, confidence)

    # Convert back to RGB and then to a PIL Image for Gradio 5.x compatibility.
    output_rgb = cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
    return Image.fromarray(output_rgb), formatted_text or ""
# Load up to three sample images for the demo gallery.  A missing ``images``
# directory means "no samples" instead of an import-time crash, and only
# regular files with an image suffix are picked up.
exts = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif')
_samples_dir = Path('images')
if _samples_dir.is_dir():
    sample_images = sorted(
        str(f) for f in _samples_dir.iterdir() if f.is_file() and f.suffix.lower() in exts
    )[:3]
else:
    sample_images = []
# Stylesheet for the page, kept as one CSS rule per entry.
_CSS_RULES = (
    ".gradio-container {font-family: 'Segoe UI', sans-serif; max-width: 1400px; margin: 0 auto; overflow-x: hidden;}",
    "body, html {overflow-x: hidden; scrollbar-width: none;}",
    "::-webkit-scrollbar {display: none;}",
    "h1 {text-align: center; color: #042AFF; margin-bottom: 1rem; font-size: 2.5rem; font-weight: bold; letter-spacing: -0.5px;}",
    ".description {text-align: center; color: #6b7280; margin-bottom: 0.3rem; font-size: 1.05rem; line-height: 1.6;}",
    ".credits {text-align: center; color: #f2faf4; margin-bottom: 2rem; margin-top: 0; font-size: 1rem;}",
    ".credits a {color: #042AFF; text-decoration: none; font-weight: bold; transition: color 0.3s ease;}",
    ".credits a:hover {color: #111F68; text-decoration: underline;}",
)
# The empty first and last entries give the text its leading and trailing newline.
css = "\n".join(("", *_CSS_RULES, ""))
# Create Gradio interface: header, a two-column row (input on the left,
# annotated result and confidence slider on the right), the extracted-text
# box underneath, and the callbacks that connect them.
with gr.Blocks(title="OCR Text Detection", theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("# 📄 OCR Text Detection")
    gr.Markdown("<div class='description'>Extract text from images with bounding boxes and confidence scores. Upload an image or select a demo image to get started.</div>", elem_classes=["description"])
    gr.Markdown("<div class='credits' style='text-align: center;'>Made by <a href='https://techtics.ai' target='_blank' style='color: #042AFF; text-decoration: none; font-weight: bold;'>Techtics.ai</a></div>", elem_classes=["credits"])

    # Main layout: two columns
    with gr.Row():
        # Column 1: upload area with tabs
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.Tab("Upload File"):
                    image_input = gr.Image(label="Upload Image", height=400, show_download_button=True, show_share_button=False, show_fullscreen_button=True)
                with gr.Tab("Image by URL"):
                    url_input = gr.Textbox(label="Image URL", placeholder="Enter image URL (jpg, png, etc.)", lines=1)
                    url_btn = gr.Button("Load Image from URL", variant="primary")

            # Demo images gallery (only built when sample images were found)
            if sample_images:
                gr.Markdown("### Demo Images (Click to load)")
                demo_gallery = gr.Gallery(value=sample_images, columns=3, rows=1, height=210, show_label=False, container=True, preview=False, object_fit="scale-down")

        # Column 2: processed image and confidence slider
        with gr.Column(scale=1):
            gr.Markdown("### Processed Image")
            annotated_output = gr.Image(label="", height=400, visible=True, show_download_button=True, show_share_button=False, show_fullscreen_button=True)
            confidence_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.05, label="Confidence Threshold", info="Filter detections by minimum confidence score")

    # Text output below both columns (full width, hidden until processing)
    text_output = gr.Textbox(label="Extracted Text", value="", placeholder="Extracted text will appear here after processing...", lines=12, interactive=True, show_copy_button=True, visible=False)

    def load_from_url(url):
        """Download an image over HTTP(S) and return it as a PIL image.

        Args:
            url: Text entered by the user; surrounding whitespace is ignored.

        Returns:
            The decoded PIL image, or ``None`` when the URL is blank, is not
            an http/https URL, cannot be fetched, or does not decode as an
            image. Every failure except a blank URL also shows a warning in
            the UI.
        """
        import io
        from urllib.parse import urlsplit

        url = (url or "").strip()
        if not url:
            return None

        # The URL comes straight from the user. urlopen also honours schemes
        # such as file://, which would expose files on the server, so only
        # plain web URLs are allowed through.
        try:
            scheme = urlsplit(url).scheme.lower()
        except ValueError:
            scheme = ""
        if scheme not in ("http", "https"):
            gr.Warning("Only http:// and https:// image URLs are supported.")
            return None

        try:
            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            with urllib.request.urlopen(req, timeout=10) as response:
                payload = response.read()
            # Decode from memory: no temporary file to clean up, and load()
            # forces the (otherwise lazy) decode so a corrupt download is
            # reported here rather than later in the pipeline.
            img = Image.open(io.BytesIO(payload))
            img.load()
            return img
        except Exception:
            # UI boundary: any network or decode failure leaves the input
            # empty, but the user is told why.
            gr.Warning("Could not load an image from that URL.")
            return None

    def load_from_gallery(evt: gr.SelectData):
        """Return the demo image at the clicked gallery position, or ``None`` if out of range."""
        if evt.index < len(sample_images):
            return Image.open(sample_images[evt.index])
        return None

    # Event handlers: both the button and pressing Enter in the URL box load the image.
    url_btn.click(fn=load_from_url, inputs=url_input, outputs=image_input)
    url_input.submit(fn=load_from_url, inputs=url_input, outputs=image_input)
    if sample_images:
        demo_gallery.select(fn=load_from_gallery, outputs=image_input)

    def on_change(img, conf_thresh):
        """Run OCR for the current image and threshold.

        Returns the annotated image and an update for the text box. The text
        box is hidden and cleared when there is no image, and shown with the
        extracted text otherwise.
        """
        if img is None:
            return None, gr.update(visible=False, value="")
        annot, text = process_ocr(img, conf_thresh)
        return annot, gr.update(visible=True, value=text or "")

    # Process image when it changes - keep the event so it can be cancelled.
    process_event = image_input.change(
        fn=on_change,
        inputs=[image_input, confidence_slider],
        outputs=[annotated_output, text_output],
    )

    # Moving the slider re-runs OCR and cancels a still-running image-change
    # run to avoid queue buildup.
    confidence_slider.change(
        fn=on_change,
        inputs=[image_input, confidence_slider],
        outputs=[annotated_output, text_output],
        cancels=[process_event],
    )
if __name__ == "__main__":
    # For local testing, use: demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
    demo.launch()