| """ |
| Visualization utilities for drawing text detection boxes on images |
| """ |
| import numpy as np |
| from PIL import Image, ImageDraw, ImageFont |
| from typing import List, Dict, Tuple |
| import os |
| import math |
|
|
def generate_random_color() -> Tuple[int, int, int]:
    """
    Pick a random RGB color for a bounding box.

    Each channel is drawn independently from NumPy's global random state.
    The upper bounds passed to ``randint`` are exclusive, so red and green
    fall in [0, 200) and blue falls in [0, 255).

    Returns:
        (red, green, blue) tuple
    """
    # The draw order (red, green, blue) matters for seeded reproducibility.
    red = np.random.randint(0, 200)
    green = np.random.randint(0, 200)
    blue = np.random.randint(0, 255)
    return (red, green, blue)
|
|
|
|
def _load_label_font(font_size: int):
    """Return the first loadable candidate font at ``font_size``, else PIL's default font."""
    module_dir = os.path.dirname(__file__)
    candidate_paths = [
        # Fonts shipped in a "fonts" directory next to this module
        os.path.join(module_dir, "fonts", "NotoSansCJK-Light.ttc"),
        os.path.join(module_dir, "fonts", "NotoSansCJK-Regular.ttc"),
        os.path.join(module_dir, "fonts", "STHeiti-Light.ttc"),
        # System font locations under /System/Library/Fonts
        "/System/Library/Fonts/STHeiti Light.ttc",
        "/System/Library/Fonts/PingFang.ttc",
        "/System/Library/Fonts/Hiragino Sans GB.ttc",
        # System font locations under /usr/share/fonts
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Light.ttc",
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
        "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc"
    ]
    for path in candidate_paths:
        try:
            return ImageFont.truetype(path, font_size)
        except (IOError, OSError):
            # Font file missing or unreadable: try the next candidate.
            continue
    return ImageFont.load_default()


def _wrap_text(draw, text, font, max_width):
    """Wrap ``text`` character by character so each line measures narrower than ``max_width``.

    Returns a ``(lines, total_height, max_line_width)`` tuple; ``([], 0, 0)``
    when there is nothing to draw.
    """
    lines = []
    for raw_line in text.split('\n'):
        current_line = ""
        for char in raw_line:
            candidate = current_line + char
            bbox = draw.textbbox((0, 0), candidate, font=font)
            if bbox[2] - bbox[0] < max_width:
                current_line = candidate
            else:
                # Flush the line that still fit and start a new one with this
                # character, even if that single character is itself too wide.
                if current_line:
                    lines.append(current_line)
                current_line = char
        if current_line:
            lines.append(current_line)

    if not lines:
        return [], 0, 0

    ascent, descent = font.getmetrics()
    # 1.2 is the line-spacing factor; it must match the one used when drawing.
    total_height = len(lines) * (ascent + descent) * 1.2

    max_line_w = 0
    for line in lines:
        bbox = draw.textbbox((0, 0), line, font=font)
        max_line_w = max(max_line_w, bbox[2] - bbox[0])

    return lines, total_height, max_line_w


def draw_detection_boxes(
    image: Image.Image,
    detections: List[Dict],
    box_width: int = 2,
    font_size: int = 12,
    show_text: bool = True,
    merge_boxes: bool = True
) -> Image.Image:
    """
    Paint each detection as a filled box with its text wrapped inside.

    For every detection the box is filled with a light background and the
    detection text is drawn over it, wrapped to the box width. A box may grow
    by up to 20% in each dimension to fit its text. A detection that fails to
    draw is reported with ``print`` and skipped; the others are still drawn.

    Args:
        image: PIL Image to draw on (not modified; a copy is used)
        detections: List of detection dicts with 'text', 'x1', 'y1', 'x2', 'y2'
        box_width: Width of bounding box lines. Accepted for API
            compatibility; the current rendering draws no outline, so it
            has no effect.
        font_size: Font size for the text drawn inside the boxes
        show_text: Whether to show text labels. Accepted for API
            compatibility; the current rendering always draws the text.
        merge_boxes: Whether to merge close boxes (default: True)

    Returns:
        New RGB image with boxes and text drawn
    """
    if merge_boxes:
        detections = merge_detections(detections)

    canvas = image.copy().convert('RGBA')
    draw = ImageDraw.Draw(canvas)

    # Load the font once, at the requested size, instead of once per detection.
    font = _load_label_font(font_size)

    for detection in detections:
        try:
            text = detection['text']
            x1, y1 = detection['x1'], detection['y1']
            x2, y2 = detection['x2'], detection['y2']

            box_w = x2 - x1
            box_h = y2 - y1

            # The box may grow by at most 20% per dimension to fit the text.
            max_allowed_w = int(box_w * 1.2)
            max_allowed_h = int(box_h * 1.2)

            # 8 px of the width is reserved for horizontal padding.
            lines, total_h, max_line_w = _wrap_text(
                draw, text, font, max_allowed_w - 8
            )

            # Never shrink the box; grow it only as far as the text needs,
            # capped at the allowed maximum.
            box_w = max(box_w, min(max_line_w + 8, max_allowed_w))
            box_h = max(box_h, min(total_h + 4, max_allowed_h))
            x2 = x1 + box_w
            y2 = y1 + box_h

            draw.rectangle(
                [x1, y1, x2, y2],
                fill=(255, 250, 240),
                outline=None
            )

            ascent, descent = font.getmetrics()
            line_height = (ascent + descent) * 1.2

            # Center the text block vertically inside the box.
            start_y = y1 + (box_h - total_h) / 2

            for j, line in enumerate(lines):
                draw.text(
                    (x1 + 4, start_y + j * line_height),
                    line,
                    font=font,
                    fill=(150, 0, 0)
                )

        except Exception as e:
            # Best effort: one malformed detection must not abort the rest.
            print(f"Error drawing detection box: {str(e)}")
            continue

    return canvas.convert('RGB')
|
|
|
|
def create_side_by_side_comparison(
    original: Image.Image,
    annotated: Image.Image,
    spacing: int = 20
) -> Image.Image:
    """
    Place the original and annotated images next to each other on one canvas.

    The canvas is white, as wide as both images plus the gap, and as tall as
    the taller image. Both images are aligned to the top edge, and each gets
    a caption drawn near its top-left corner.

    Args:
        original: Original image (placed on the left)
        annotated: Annotated image with boxes (placed on the right)
        spacing: Space between images in pixels

    Returns:
        Combined image showing both versions
    """
    left_w, left_h = original.size
    right_w, right_h = annotated.size
    right_x = left_w + spacing

    canvas = Image.new(
        'RGB',
        (left_w + right_w + spacing, max(left_h, right_h)),
        (255, 255, 255)
    )
    canvas.paste(original, (0, 0))
    canvas.paste(annotated, (right_x, 0))

    # Caption font: first candidate that loads wins, otherwise PIL's default.
    candidate_fonts = (
        "/System/Library/Fonts/PingFang.ttc",
        "/System/Library/Fonts/Hiragino Sans GB.ttc",
        "/System/Library/Fonts/STHeiti Light.ttc",
        "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
        "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
    )
    for font_file in candidate_fonts:
        try:
            caption_font = ImageFont.truetype(font_file, 24)
        except (IOError, OSError):
            continue
        break
    else:
        caption_font = ImageFont.load_default()

    pen = ImageDraw.Draw(canvas)
    captions = (
        (10, "Original"),
        (right_x + 10, "Detected Text"),
    )
    for caption_x, caption in captions:
        pen.text((caption_x, 10), caption, font=caption_font, fill=(0, 0, 0))

    return canvas
|
|
|
|
def get_detection_summary(detections: List[Dict]) -> str:
    """
    Create a text summary of detection results

    Each detection is listed with a 1-based index, its text and its box
    corners. When a detection carries an 'original_text' that differs from
    its 'text', both are shown (as "Original" and "Translated").

    Args:
        detections: List of detection dictionaries with 'text', 'x1', 'y1',
            'x2', 'y2' and optionally 'original_text'

    Returns:
        Formatted summary string, or a fixed message when the list is empty

    Raises:
        KeyError: If a detection lacks 'text' or one of the coordinate keys
    """
    if not detections:
        return "No text detected in the image."

    # Collect the pieces and join once instead of repeated string +=.
    parts = [f"Detected {len(detections)} text region(s):\n\n"]

    for i, det in enumerate(detections, 1):
        if 'original_text' in det and det['original_text'] != det['text']:
            parts.append(f"{i}. Original: \"{det['original_text']}\"\n")
            parts.append(f" Translated: \"{det['text']}\"\n")
        else:
            parts.append(f"{i}. \"{det['text']}\"\n")
        parts.append(
            f" Location: ({det['x1']}, {det['y1']}) → ({det['x2']}, {det['y2']})\n\n"
        )

    return "".join(parts)
|
|
|
|
def merge_detections(detections: List[Dict], threshold: int = 30) -> List[Dict]:
    """
    Merge close detection boxes into single boxes

    Two boxes are "close" when they overlap after one of them is grown by
    ``threshold`` on every side. Closeness is applied transitively: every
    connected group of boxes becomes one detection whose box is the bounding
    box of the group. The group's texts are concatenated in order of
    descending 'x1', then ascending 'y1', and all space characters are
    removed from the result (this also applies to a detection that was not
    merged with anything).

    Only 'text', 'x1', 'y1', 'x2', 'y2' appear in the returned dicts; any
    other keys on the input detections are not carried over. The input list
    and its dicts are not modified.

    Args:
        detections: List of detection dicts
        threshold: Distance threshold for merging

    Returns:
        List of merged detection dicts
    """
    if not detections:
        return []

    def are_close(box1, box2):
        # Overlap test between box1 grown by `threshold` and box2.
        return (
            box1['x2'] + threshold >= box2['x1']
            and box1['x1'] - threshold <= box2['x2']
            and box1['y2'] + threshold >= box2['y1']
            and box1['y1'] - threshold <= box2['y2']
        )

    # Build an undirected graph: one node per detection, an edge per close pair.
    n = len(detections)
    adj = [[] for _ in range(n)]
    for i in range(n):
        for j in range(i + 1, n):
            if are_close(detections[i], detections[j]):
                adj[i].append(j)
                adj[j].append(i)

    visited = [False] * n
    merged_results = []

    for start in range(n):
        if visited[start]:
            continue

        # Iterative depth-first search collects one connected component.
        component = []
        stack = [start]
        visited[start] = True
        while stack:
            curr = stack.pop()
            component.append(detections[curr])
            for neighbor in adj[curr]:
                if not visited[neighbor]:
                    visited[neighbor] = True
                    stack.append(neighbor)

        # Reading order for the merged text: larger x1 first, then smaller y1.
        # The sort is stable, so ties keep the traversal order.
        component.sort(key=lambda d: (-d['x1'], d['y1']))

        merged_results.append({
            'text': "".join(d['text'] for d in component).replace(" ", ""),
            'x1': min(d['x1'] for d in component),
            'y1': min(d['y1'] for d in component),
            'x2': max(d['x2'] for d in component),
            'y2': max(d['y2'] for d in component)
        })

    return merged_results
|
|