Spaces:
Running
Running
test workflow
Browse files- ComicPanelExtractor.py +239 -0
- requirements.txt +4 -0
- text_detector.py +148 -0
ComicPanelExtractor.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
from text_detector import TextDetector, Config as CVP_Config
|
| 5 |
+
import cv2
|
| 6 |
+
import shutil
|
| 7 |
+
|
| 8 |
+
# ----------------------------------------------------------
|
| 9 |
+
# MASK TEXT REGIONS
|
| 10 |
+
# ----------------------------------------------------------
|
| 11 |
+
|
| 12 |
+
def mask_text_regions(image_path, bboxes, output_path=None, color=(0, 0, 0)):
    """
    Fill the text regions of an image with a solid color (default black) to
    reduce noise during panel extraction.

    Args:
        image_path (str): Path to the input image.
        bboxes (list of list): Bounding boxes in [x1, y1, x2, y2] format.
            Coordinates may be ints or floats; they are cast to int.
        output_path (str, optional): If given, the masked image is also saved here.
        color (tuple): BGR fill color for the boxes (default black).

    Returns:
        numpy.ndarray: Image with masked text regions.

    Raises:
        ValueError: If the image cannot be loaded.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not load image: {image_path}")

    for x1, y1, x2, y2 in bboxes:
        # thickness=-1 fills the rectangle; int() guards against float
        # coordinates coming from JSON.
        cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)),
                      color, thickness=-1)

    if output_path:
        cv2.imwrite(output_path, image)
        print(f"✅ Text-masked image saved to: {output_path}")

    return image
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# ----------------------------------------------------------
|
| 40 |
+
# PRE PROCESS METHOD
|
| 41 |
+
# ----------------------------------------------------------
|
| 42 |
+
|
| 43 |
+
def pre_process(image_path, output_dir):
    """
    Prepare a page image for panel extraction and save each intermediate stage.

    Pipeline: grayscale -> inverted binary threshold (pixels brighter than 200
    become 0) -> 5x5 rectangular dilation (2 iterations) to strengthen borders
    and fill small gaps. Writes 2_gray.jpg, 3_binary.jpg and 4_dilated.jpg
    into output_dir.

    Args:
        image_path (str): Path to the input (typically text-masked) page image.
        output_dir (str): Directory for intermediate images; created if missing.

    Raises:
        ValueError: If the image cannot be loaded.
    """
    # exist_ok avoids the race between an existence check and creation.
    os.makedirs(output_dir, exist_ok=True)

    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not load image: {image_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)

    # Dilate to strengthen borders and fill small gaps.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    dilated = cv2.dilate(binary, kernel, iterations=2)

    cv2.imwrite(os.path.join(output_dir, "2_gray.jpg"), gray)
    cv2.imwrite(os.path.join(output_dir, "3_binary.jpg"), binary)
    cv2.imwrite(os.path.join(output_dir, "4_dilated.jpg"), dilated)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# ----------------------------------------------------------
|
| 65 |
+
# CLEAN DILATED IMAGE
|
| 66 |
+
# ----------------------------------------------------------
|
| 67 |
+
|
| 68 |
+
def clean_dilated_with_row_priority(dilated_path, output_path, max_neighbors=2):
|
| 69 |
+
"""
|
| 70 |
+
Clean a dilated comic page by thinning thick borders using Game-of-Life logic,
|
| 71 |
+
with preference to clean rows that have fewer black pixels.
|
| 72 |
+
"""
|
| 73 |
+
dilated = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
|
| 74 |
+
if dilated is None:
|
| 75 |
+
raise Exception("Could not load dilated image.")
|
| 76 |
+
|
| 77 |
+
binary = (dilated == 0).astype(np.uint8)
|
| 78 |
+
padded = np.pad(binary, pad_width=1, mode="constant", constant_values=0)
|
| 79 |
+
cleaned = binary.copy()
|
| 80 |
+
|
| 81 |
+
height, width = binary.shape
|
| 82 |
+
row_black_counts = np.sum(binary, axis=1)
|
| 83 |
+
|
| 84 |
+
for y in range(1, height + 1):
|
| 85 |
+
for x in range(1, width + 1):
|
| 86 |
+
if padded[y, x] == 1:
|
| 87 |
+
neighbors = np.sum(padded[y-1:y+2, x-1:x+2]) - 1
|
| 88 |
+
if neighbors > max_neighbors:
|
| 89 |
+
neighbor_rows = [r for r in [y-1, y, y+1] if 1 <= r <= height]
|
| 90 |
+
if neighbor_rows:
|
| 91 |
+
row_to_clear = min(neighbor_rows, key=lambda r: row_black_counts[r-1])
|
| 92 |
+
if y == row_to_clear:
|
| 93 |
+
cleaned[y-1, x-1] = 0
|
| 94 |
+
|
| 95 |
+
cleaned_img = (1 - cleaned) * 255
|
| 96 |
+
cv2.imwrite(output_path, cleaned_img)
|
| 97 |
+
print(f"✅ Cleaned dilated image saved to: {output_path}")
|
| 98 |
+
return output_path
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# ----------------------------------------------------------
|
| 102 |
+
# EXTRACT PANELS - BLACK PERCENTAGE METHOD
|
| 103 |
+
# ----------------------------------------------------------
|
| 104 |
+
|
| 105 |
+
def extract_panels_by_black_percentage_fixed(
|
| 106 |
+
dilated_path, original_image_path, output_dir,
|
| 107 |
+
row_thresh=20, col_thresh=20,
|
| 108 |
+
min_width_ratio=0.1, min_height_ratio=0.1
|
| 109 |
+
):
|
| 110 |
+
"""
|
| 111 |
+
Extract comic panels using black percentage scan with smart width & height filtering.
|
| 112 |
+
"""
|
| 113 |
+
if not os.path.exists(output_dir):
|
| 114 |
+
os.makedirs(output_dir)
|
| 115 |
+
|
| 116 |
+
dilated = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
|
| 117 |
+
original = cv2.imread(original_image_path)
|
| 118 |
+
if dilated is None or original is None:
|
| 119 |
+
raise Exception("Could not load dilated or original image.")
|
| 120 |
+
|
| 121 |
+
height, width = dilated.shape
|
| 122 |
+
visual_output = original.copy()
|
| 123 |
+
|
| 124 |
+
# Detect row gutters
|
| 125 |
+
row_black_percentage = np.sum(dilated == 0, axis=1) / width * 100
|
| 126 |
+
row_gutters, panel_rows = [], []
|
| 127 |
+
in_gutter = False
|
| 128 |
+
for y, percent_black in enumerate(row_black_percentage):
|
| 129 |
+
if percent_black >= row_thresh and not in_gutter:
|
| 130 |
+
start_row = y
|
| 131 |
+
in_gutter = True
|
| 132 |
+
elif percent_black < row_thresh and in_gutter:
|
| 133 |
+
end_row = y
|
| 134 |
+
row_gutters.append((start_row, end_row))
|
| 135 |
+
in_gutter = False
|
| 136 |
+
|
| 137 |
+
prev_end = 0
|
| 138 |
+
for start, end in row_gutters:
|
| 139 |
+
if start - prev_end > 10:
|
| 140 |
+
panel_rows.append((prev_end, start))
|
| 141 |
+
prev_end = end
|
| 142 |
+
if height - prev_end > 10:
|
| 143 |
+
panel_rows.append((prev_end, height))
|
| 144 |
+
|
| 145 |
+
# Extract panels
|
| 146 |
+
all_panels, panel_count, panel_images, panel_points = [], 0, [], []
|
| 147 |
+
for y1, y2 in panel_rows:
|
| 148 |
+
row_slice = dilated[y1:y2, :]
|
| 149 |
+
col_black_percentage = np.sum(row_slice == 0, axis=0) / (y2 - y1) * 100
|
| 150 |
+
col_gutters, panel_cols = [], []
|
| 151 |
+
in_gutter_col = False
|
| 152 |
+
for x, percent_black in enumerate(col_black_percentage):
|
| 153 |
+
if percent_black >= col_thresh and not in_gutter_col:
|
| 154 |
+
start_col = x
|
| 155 |
+
in_gutter_col = True
|
| 156 |
+
elif percent_black < col_thresh and in_gutter_col:
|
| 157 |
+
end_col = x
|
| 158 |
+
col_gutters.append((start_col, end_col))
|
| 159 |
+
in_gutter_col = False
|
| 160 |
+
|
| 161 |
+
prev_end_col = 0
|
| 162 |
+
for start, end in col_gutters:
|
| 163 |
+
if start - prev_end_col > 10:
|
| 164 |
+
panel_cols.append((prev_end_col, start))
|
| 165 |
+
prev_end_col = end
|
| 166 |
+
if width - prev_end_col > 10:
|
| 167 |
+
panel_cols.append((prev_end_col, width))
|
| 168 |
+
|
| 169 |
+
for x1, x2 in panel_cols:
|
| 170 |
+
w, h = x2 - x1, y2 - y1
|
| 171 |
+
if w * h < (width * height) * 0.005:
|
| 172 |
+
continue
|
| 173 |
+
all_panels.append((x1, y1, x2, y2))
|
| 174 |
+
|
| 175 |
+
# Post-filter
|
| 176 |
+
panel_widths = [x2 - x1 for x1, _, x2, _ in all_panels]
|
| 177 |
+
panel_heights = [y2 - y1 for _, y1, _, y2 in all_panels]
|
| 178 |
+
avg_width = np.mean(panel_widths) if panel_widths else 0
|
| 179 |
+
avg_height = np.mean(panel_heights) if panel_heights else 0
|
| 180 |
+
min_allowed_width = max(avg_width * 0.5, width * min_width_ratio)
|
| 181 |
+
min_allowed_height = max(avg_height * 0.5, height * min_height_ratio)
|
| 182 |
+
|
| 183 |
+
for x1, y1, x2, y2 in all_panels:
|
| 184 |
+
panel_width, panel_height = x2 - x1, y2 - y1
|
| 185 |
+
if panel_width >= min_allowed_width and panel_height >= min_allowed_height:
|
| 186 |
+
panel = original[y1:y2, x1:x2]
|
| 187 |
+
panel_count += 1
|
| 188 |
+
panel_images.append(panel)
|
| 189 |
+
panel_points.append({
|
| 190 |
+
"x_start": x1, "y_start": y1, "x_end": x2, "y_end": y2
|
| 191 |
+
})
|
| 192 |
+
panel_path = os.path.join(output_dir, f"panel_{panel_count}.jpg")
|
| 193 |
+
cv2.imwrite(panel_path, panel)
|
| 194 |
+
cv2.rectangle(visual_output, (x1, y1), (x2, y2), (0, 255, 0), 2)
|
| 195 |
+
cv2.putText(visual_output, f"#{panel_count}", (x1+5, y1+25),
|
| 196 |
+
cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
|
| 197 |
+
|
| 198 |
+
print(f"✅ Extracted {panel_count} panels after smart width & height filtering.")
|
| 199 |
+
return output_dir, panel_images, panel_points
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
# ----------------------------------------------------------
|
| 203 |
+
# MAIN EXECUTION
|
| 204 |
+
# ----------------------------------------------------------
|
| 205 |
+
if __name__ == "__main__":
    image_path = "input.jpg"
    output_dir = "extracted_panels"

    # Start from a clean output directory on every run.
    shutil.rmtree(output_dir, ignore_errors=True)
    os.makedirs(output_dir, exist_ok=True)

    # Point the text-detection pipeline at the page and the output folder.
    cvp_config = CVP_Config()
    cvp_config.main_file_name = image_path
    cvp_config.temp_folder = output_dir
    cvp_config.comic_image = image_path
    cvp_config.output_video = f"{output_dir}/test.mp4"

    # Locate speech bubbles so they can be painted out before border analysis.
    with TextDetector(cvp_config) as text_detector:
        bubbles_path = text_detector.detect_and_group_text(cvp_config.comic_image)
        with open(bubbles_path, "r", encoding="utf-8") as f:
            bubbles = json.load(f)

    text_removed_path = os.path.join(output_dir, "1_text_removed.jpg")
    mask_text_regions(image_path, [box["bbox"] for box in bubbles],
                      output_path=text_removed_path)

    # Grayscale/threshold/dilate the masked page, then thin the thick borders.
    pre_process(text_removed_path, output_dir)
    dilated_path = os.path.join(output_dir, "4_dilated.jpg")
    cleaned_dilated_path = os.path.join(output_dir, "5_dilated_cleaned.jpg")
    clean_dilated_with_row_priority(dilated_path, cleaned_dilated_path, max_neighbors=2)

    # Finally, carve the original page into panels along the detected gutters.
    extract_panels_by_black_percentage_fixed(
        cleaned_dilated_path,
        image_path,
        output_dir,
        min_width_ratio=0.1,  # Panels must be at least 10% of total width
    )
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
moviepy==1.0.3
|
| 2 |
+
numpy
|
| 3 |
+
opencv-python
|
| 4 |
+
easyocr
|
text_detector.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from typing import List, Tuple, Optional
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
from moviepy.editor import *
|
| 8 |
+
|
| 9 |
+
@dataclass
class Config:
    """Configuration settings for the comic-to-video pipeline."""
    # Base name of the comic file being processed (set by the calling script).
    main_file_name: str = ""
    # Path to the comic page image fed into text detection.
    comic_image: str = ""
    # Working directory for intermediate artifacts (set by the calling script).
    temp_folder: str = ""
    # Max center-to-center distance (px) for merging OCR hits into one bubble
    # (used by TextDetector.group_text_regions).
    distance_threshold: int = 70
    # NOTE(review): not referenced in this file — presumably a vertical
    # grouping limit used elsewhere in the pipeline; confirm.
    vertical_threshold: int = 30
    # NOTE(review): text-to-speech engine id; not used in this file.
    tts_engine: str = "chatterbox"
    # Target video resolution as (width, height); not used in this file.
    resolution: Tuple[int, int] = (1920, 1080)
    # Fraction of the frame kept as margin; not used in this file.
    margin_ratio: float = 0.08
    # NOTE(review): scroll/zoom flags for video rendering; not used here.
    auto_scroll: bool = True
    zoom_enabled: bool = False
    zoom_factor: float = 1.1
    # Output video path; also used to derive the grouped-bubbles JSON filename
    # in TextDetector.detect_and_group_text.
    output_video: str = "comic_text.mp4"
    # OCR detections shorter than this many characters are discarded
    # before grouping.
    min_text_length: int = 2
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@dataclass
class TextDetection:
    """Represents a detected text region."""
    # Axis-aligned box as [x1, y1, x2, y2] (top-left / bottom-right corners).
    bbox: List[int]
    # Recognized text content (whitespace-stripped on creation).
    text: str
    # OCR confidence score for this detection.
    confidence: float
    # 1-based reading-order id, assigned after grouping (None until then).
    id: Optional[int] = None
|
| 34 |
+
|
| 35 |
+
class TextDetector:
    """Handles text detection and grouping from comic images."""

    def __init__(self, config: Config):
        self.config = config
        # Lazily-created EasyOCR reader. Building it is expensive (loads the
        # recognition model), so it is created once on first use and reused.
        self.reader = None

    def load(self):
        """Create the EasyOCR reader if it does not exist yet (idempotent)."""
        if self.reader is None:
            import easyocr
            self.reader = easyocr.Reader(['en'])

    def detect_text(self, image_path: str) -> List[TextDetection]:
        """Detect text regions in the image.

        Returns one TextDetection per raw OCR hit, with EasyOCR's 4-point
        quadrilateral collapsed to an axis-aligned [x1, y1, x2, y2] bbox.
        """
        self.load()
        results = self.reader.readtext(image_path)
        print(f"EasyOCR found {len(results)} raw detections")

        detections = []
        for box, text, confidence in results:
            # Take the bounding rectangle of the 4 corner points.
            xs = [pt[0] for pt in box]
            ys = [pt[1] for pt in box]
            detections.append(TextDetection(
                bbox=[min(xs), min(ys), max(xs), max(ys)],
                text=text.strip(),
                confidence=float(confidence),
            ))

        return detections

    @staticmethod
    def calculate_distance(bbox1: List[int], bbox2: List[int]) -> float:
        """Calculate Euclidean distance between two bounding box centers."""
        center1 = [(bbox1[0] + bbox1[2]) / 2, (bbox1[1] + bbox1[3]) / 2]
        center2 = [(bbox2[0] + bbox2[2]) / 2, (bbox2[1] + bbox2[3]) / 2]
        return np.linalg.norm(np.subtract(center1, center2))

    def group_text_regions(self, detections: List[TextDetection]) -> List[TextDetection]:
        """Group nearby text regions into speech bubbles.

        Detections shorter than config.min_text_length are dropped; each
        remaining detection merges into the first existing group whose
        (growing) bbox center lies within config.distance_threshold, so
        results are order-dependent by vertical position. A merged group
        keeps the confidence of its seed detection.
        """
        # Filter out detections too short to be meaningful text.
        filtered_detections = [
            det for det in detections
            if len(det.text.strip()) >= self.config.min_text_length
        ]

        # Process top-to-bottom so grouping follows reading order.
        filtered_detections.sort(key=lambda d: d.bbox[1])

        groups = []
        for detection in filtered_detections:
            added_to_group = False

            for group in groups:
                if self.calculate_distance(detection.bbox, group.bbox) < self.config.distance_threshold:
                    # Merge: append text and expand the group's bounding box.
                    group.text += " " + detection.text
                    group.bbox = [
                        min(group.bbox[0], detection.bbox[0]),
                        min(group.bbox[1], detection.bbox[1]),
                        max(group.bbox[2], detection.bbox[2]),
                        max(group.bbox[3], detection.bbox[3])
                    ]
                    added_to_group = True
                    break

            if not added_to_group:
                groups.append(detection)

        # Sort groups by vertical position and assign 1-based reading ids.
        groups.sort(key=lambda g: g.bbox[1])
        for idx, group in enumerate(groups):
            group.id = idx + 1

        return groups

    def detect_and_group_text(self, image_path: str) -> str:
        """Detect text, group it into bubbles, and save the result as JSON.

        The JSON path is derived from config.output_video. If that file
        already exists it is reused as a cache and OCR is skipped entirely.

        Returns:
            str: Path of the grouped-bubbles JSON file.
        """
        output_path = self.config.output_video.replace(".mp4", "_detect_and_group_text.json")
        if not os.path.exists(output_path):
            detections = self.detect_text(image_path)
            groups = self.group_text_regions(detections)
            groups_data = []
            for group in groups:
                groups_data.append({
                    "id": group.id,
                    "bbox": [int(x) for x in group.bbox],
                    "text": group.text,
                    "confidence": group.confidence
                })

            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(groups_data, f, indent=2, ensure_ascii=False)

            print(f"Grouped bubbles saved: {output_path}")
        return str(output_path)

    def cleanup(self):
        """Drop the OCR reader so its model memory can be reclaimed.

        Safe to call multiple times and before load(); load() will simply
        recreate the reader on the next detect_text call.
        """
        self.reader = None

    def __enter__(self):
        """Context-manager entry; the reader is still created lazily."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Release resources when leaving the context."""
        self.cleanup()

    def __del__(self):
        """Best-effort cleanup at garbage collection."""
        self.cleanup()
|