Spaces:
Running
Running
mv as pck
Browse files- .gitignore +1 -0
- ComicPanelExtractor.py +0 -239
- comic_panel_extractor/__init__.py +16 -0
- comic_panel_extractor/cli.py +88 -0
- comic_panel_extractor/config.py +11 -0
- comic_panel_extractor/image_processor.py +84 -0
- comic_panel_extractor/main.py +57 -0
- comic_panel_extractor/panel_extractor.py +184 -0
- comic_panel_extractor/text_detector.py +144 -0
- setup.py +39 -0
- text_detector.py +0 -148
.gitignore
CHANGED
|
@@ -205,3 +205,4 @@ cython_debug/
|
|
| 205 |
marimo/_static/
|
| 206 |
marimo/_lsp/
|
| 207 |
__marimo__/
|
|
|
|
|
|
| 205 |
marimo/_static/
|
| 206 |
marimo/_lsp/
|
| 207 |
__marimo__/
|
| 208 |
+
temp_dir
|
ComicPanelExtractor.py
DELETED
|
@@ -1,239 +0,0 @@
|
|
| 1 |
-
import numpy as np
|
| 2 |
-
import os
|
| 3 |
-
import json
|
| 4 |
-
from text_detector import TextDetector, Config as CVP_Config
|
| 5 |
-
import cv2
|
| 6 |
-
import shutil
|
| 7 |
-
|
| 8 |
-
# ----------------------------------------------------------
|
| 9 |
-
# MASK TEXT REGIONS
|
| 10 |
-
# ----------------------------------------------------------
|
| 11 |
-
|
| 12 |
-
def mask_text_regions(image_path, bboxes, output_path=None, color=(0, 0, 0)):
    """Fill every detected text bounding box with a solid color.

    Painting over the lettering removes OCR-visible text so it cannot
    create spurious edges during the later panel-extraction passes.

    Args:
        image_path (str): Path to the input image.
        bboxes (list of list): Bounding boxes in [x1, y1, x2, y2] format.
        output_path (str, optional): Where to save the modified image.
        color (tuple): BGR fill color for the boxes (default black).

    Returns:
        numpy array: The image with all text regions painted over.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise Exception(f"Could not load image: {image_path}")

    # thickness=-1 makes cv2.rectangle fill the box instead of outlining it.
    for x1, y1, x2, y2 in bboxes:
        cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness=-1)

    if output_path:
        cv2.imwrite(output_path, image)
        print(f"✅ Text-masked image saved to: {output_path}")

    return image
| 37 |
-
|
| 38 |
-
|
| 39 |
-
# ----------------------------------------------------------
|
| 40 |
-
# PRE PROCESS METHOD
|
| 41 |
-
# ----------------------------------------------------------
|
| 42 |
-
|
| 43 |
-
def pre_process(image_path, output_dir):
    """Produce the grayscale / binary / dilated intermediates for a page.

    Writes 2_gray.jpg, 3_binary.jpg and 4_dilated.jpg into output_dir.

    Args:
        image_path (str): Path to the (text-masked) page image.
        output_dir (str): Directory for the intermediate images; created
            if it does not exist yet.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    page = cv2.imread(image_path)
    if page is None:
        raise Exception(f"Could not load image: {image_path}")

    # Inverted threshold turns dark panel borders into white (255) on black.
    grayscale = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
    _, binarized = cv2.threshold(grayscale, 200, 255, cv2.THRESH_BINARY_INV)

    # Dilation strengthens the borders and bridges small gaps in them.
    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    thickened = cv2.dilate(binarized, rect_kernel, iterations=2)

    for name, img in (("2_gray.jpg", grayscale),
                      ("3_binary.jpg", binarized),
                      ("4_dilated.jpg", thickened)):
        cv2.imwrite(os.path.join(output_dir, name), img)
| 62 |
-
|
| 63 |
-
|
| 64 |
-
# ----------------------------------------------------------
|
| 65 |
-
# CLEAN DILATED IMAGE
|
| 66 |
-
# ----------------------------------------------------------
|
| 67 |
-
|
| 68 |
-
def clean_dilated_with_row_priority(dilated_path, output_path, max_neighbors=2):
    """
    Clean a dilated comic page by thinning thick borders using Game-of-Life logic,
    with preference to clean rows that have fewer black pixels.

    Args:
        dilated_path (str): Path to the dilated grayscale image.
        output_path (str): Where the cleaned image is written.
        max_neighbors (int): A black pixel with more than this many black
            3x3 neighbours is a candidate for removal.

    Returns:
        str: output_path, for chaining.
    """
    page = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
    if page is None:
        raise Exception("Could not load dilated image.")

    # 1 = border (black) pixel, 0 = background; pad with one background
    # ring so every 3x3 neighbourhood lookup stays in bounds.
    ink = (page == 0).astype(np.uint8)
    framed = np.pad(ink, pad_width=1, mode="constant", constant_values=0)
    thinned = ink.copy()

    rows, cols = ink.shape
    ink_per_row = np.sum(ink, axis=1)

    for r in range(1, rows + 1):
        for c in range(1, cols + 1):
            if framed[r, c] != 1:
                continue
            crowd = np.sum(framed[r-1:r+2, c-1:c+2]) - 1
            if crowd <= max_neighbors:
                continue
            # Prefer to erase from whichever adjacent row already carries
            # the least ink; only erase when that row is the current one.
            candidates = [k for k in (r - 1, r, r + 1) if 1 <= k <= rows]
            if candidates and r == min(candidates, key=lambda k: ink_per_row[k - 1]):
                thinned[r - 1, c - 1] = 0

    cv2.imwrite(output_path, (1 - thinned) * 255)
    print(f"✅ Cleaned dilated image saved to: {output_path}")
    return output_path
| 99 |
-
|
| 100 |
-
|
| 101 |
-
# ----------------------------------------------------------
|
| 102 |
-
# EXTRACT PANELS - BLACK PERCENTAGE METHOD
|
| 103 |
-
# ----------------------------------------------------------
|
| 104 |
-
|
| 105 |
-
def extract_panels_by_black_percentage_fixed(
    dilated_path, original_image_path, output_dir,
    row_thresh=20, col_thresh=20,
    min_width_ratio=0.1, min_height_ratio=0.1
):
    """
    Extract comic panels using black percentage scan with smart width & height filtering.

    Args:
        dilated_path (str): Cleaned/dilated grayscale image used to find gutters.
        original_image_path (str): Original page the panels are cropped from.
        output_dir (str): Directory receiving panel_N.jpg crops.
        row_thresh / col_thresh (int): Percentage of black pixels above which
            a row/column counts as gutter.
        min_width_ratio / min_height_ratio (float): Minimum panel size as a
            fraction of the page size.

    Returns:
        tuple: (output_dir, list of panel images, list of coordinate dicts).
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    gutter_map = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
    page = cv2.imread(original_image_path)
    if gutter_map is None or page is None:
        raise Exception("Could not load dilated or original image.")

    page_h, page_w = gutter_map.shape
    annotated = page.copy()

    def closed_runs(blackness, threshold):
        # [start, end) runs where blackness stays >= threshold; a run still
        # open when the scan ends is discarded, matching the original
        # streaming scan's behaviour.
        runs, start, inside = [], 0, False
        for idx, pct in enumerate(blackness):
            if pct >= threshold and not inside:
                start, inside = idx, True
            elif pct < threshold and inside:
                runs.append((start, idx))
                inside = False
        return runs

    def content_spans(runs, total, min_gap=10):
        # Invert gutter runs into the content segments between them.
        spans, cursor = [], 0
        for run_start, run_end in runs:
            if run_start - cursor > min_gap:
                spans.append((cursor, run_start))
            cursor = run_end
        if total - cursor > min_gap:
            spans.append((cursor, total))
        return spans

    # Split the page into horizontal bands, then each band into panels.
    row_blackness = np.sum(gutter_map == 0, axis=1) / page_w * 100
    bands = content_spans(closed_runs(row_blackness, row_thresh), page_h)

    candidates = []
    for y1, y2 in bands:
        col_blackness = np.sum(gutter_map[y1:y2, :] == 0, axis=0) / (y2 - y1) * 100
        for x1, x2 in content_spans(closed_runs(col_blackness, col_thresh), page_w):
            # Drop specks below 0.5% of the page area immediately.
            if (x2 - x1) * (y2 - y1) < (page_w * page_h) * 0.005:
                continue
            candidates.append((x1, y1, x2, y2))

    # Smart post-filter: a panel must reach both the ratio floors and half
    # the average candidate size.
    widths = [x2 - x1 for x1, _, x2, _ in candidates]
    heights = [y2 - y1 for _, y1, _, y2 in candidates]
    mean_w = np.mean(widths) if widths else 0
    mean_h = np.mean(heights) if heights else 0
    floor_w = max(mean_w * 0.5, page_w * min_width_ratio)
    floor_h = max(mean_h * 0.5, page_h * min_height_ratio)

    panel_count, panel_images, panel_points = 0, [], []
    for x1, y1, x2, y2 in candidates:
        if x2 - x1 < floor_w or y2 - y1 < floor_h:
            continue
        crop = page[y1:y2, x1:x2]
        panel_count += 1
        panel_images.append(crop)
        panel_points.append({
            "x_start": x1, "y_start": y1, "x_end": x2, "y_end": y2
        })
        cv2.imwrite(os.path.join(output_dir, f"panel_{panel_count}.jpg"), crop)
        cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(annotated, f"#{panel_count}", (x1+5, y1+25),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)

    print(f"✅ Extracted {panel_count} panels after smart width & height filtering.")
    return output_dir, panel_images, panel_points
| 200 |
-
|
| 201 |
-
|
| 202 |
-
# ----------------------------------------------------------
|
| 203 |
-
# MAIN EXECUTION
|
| 204 |
-
# ----------------------------------------------------------
|
| 205 |
-
if __name__ == "__main__":
    image_path = "input.jpg"
    output_dir = "extracted_panels"

    # Always start from a clean output directory.
    shutil.rmtree(output_dir, ignore_errors=True)
    os.makedirs(output_dir, exist_ok=True)

    # Configure the OCR-based text detector.
    cvp_config = CVP_Config()
    cvp_config.main_file_name = image_path
    cvp_config.temp_folder = output_dir
    cvp_config.comic_image = image_path
    cvp_config.output_video = f"{output_dir}/test.mp4"

    # Detect text bubbles and read back their grouped coordinates.
    with TextDetector(cvp_config) as text_detector:
        bubbles_path = text_detector.detect_and_group_text(cvp_config.comic_image)
        with open(bubbles_path, "r", encoding="utf-8") as f:
            bubbles = json.load(f)

    # Mask the text, then run the preprocessing chain on the masked page.
    output_path = os.path.join(output_dir, "1_text_removed.jpg")
    masked_image = mask_text_regions(image_path, [box["bbox"] for box in bubbles], output_path=output_path)

    pre_process(output_path, output_dir)

    # Thin the dilated borders before gutter detection.
    dilated_path = os.path.join(output_dir, "4_dilated.jpg")
    cleaned_dilated_path = os.path.join(output_dir, "5_dilated_cleaned.jpg")
    clean_dilated_with_row_priority(dilated_path, cleaned_dilated_path, max_neighbors=2)

    # Finally cut the original page into panels.
    extract_panels_by_black_percentage_fixed(
        cleaned_dilated_path,
        image_path,
        output_dir,
        min_width_ratio=0.1,  # Panels must be at least 10% of total width
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
comic_panel_extractor/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .main import ComicPanelExtractor
|
| 2 |
+
from .config import Config
|
| 3 |
+
from .text_detector import TextDetector, TextDetection
|
| 4 |
+
from .image_processor import ImageProcessor
|
| 5 |
+
from .panel_extractor import PanelExtractor, PanelData
|
| 6 |
+
|
| 7 |
+
__version__ = "0.1.0"
|
| 8 |
+
__all__ = [
|
| 9 |
+
"ComicPanelExtractor",
|
| 10 |
+
"Config",
|
| 11 |
+
"TextDetector",
|
| 12 |
+
"TextDetection",
|
| 13 |
+
"ImageProcessor",
|
| 14 |
+
"PanelExtractor",
|
| 15 |
+
"PanelData"
|
| 16 |
+
]
|
comic_panel_extractor/cli.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Command-line interface for comic panel extraction.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import sys
|
| 8 |
+
import json
|
| 9 |
+
from typing import Optional, List
|
| 10 |
+
|
| 11 |
+
from .main import ComicPanelExtractor
|
| 12 |
+
from .config import Config
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class ComicPanelCLI:
    """Command-line interface for comic panel extraction."""

    def __init__(self):
        self.parser = self._create_parser()

    def _create_parser(self) -> argparse.ArgumentParser:
        """Create the argument parser for the ``comic-extract`` command."""
        parser = argparse.ArgumentParser(
            prog="comic-extract",
            description="Extract panels from comic book images using OCR and image processing",
            formatter_class=argparse.RawDescriptionHelpFormatter,
            epilog="""
Examples:
  comic-extract comic.jpg
  comic-extract comic.jpg --config config.json
"""
        )

        # Required arguments
        parser.add_argument(
            "input_path",
            help="Path to the comic image file"
        )

        # Configuration file
        parser.add_argument(
            "--config",
            help="Path to JSON configuration file"
        )

        # Bug fix: _load_config reads args.verbose, but no such flag was
        # ever defined, so any invocation with --config crashed with an
        # AttributeError before this argument existed.
        parser.add_argument(
            "--verbose",
            action="store_true",
            help="Print extra progress information"
        )

        return parser

    def run(self, args: Optional[List[str]] = None) -> int:
        """Parse arguments and run the extraction pipeline.

        Returns:
            int: process exit code — 0 on success, 1 on any failure.
        """
        try:
            parsed_args = self.parser.parse_args(args)
            # Load configuration
            config = self._load_config(parsed_args)
            ComicPanelExtractor(config).extract_panels_from_comic()
        except Exception as e:
            print(f"❌ Error: {e}", file=sys.stderr)
            return 1
        return 0  # bug fix: success path previously fell through returning None

    def _load_config(self, args: argparse.Namespace) -> "Config":
        """Load configuration from file (if given) and apply CLI overrides."""
        config = Config()

        # Load from config file if provided; keys that Config does not
        # declare are silently ignored.
        if args.config:
            try:
                with open(args.config, 'r', encoding='utf-8') as f:
                    config_data = json.load(f)
                for key, value in config_data.items():
                    if hasattr(config, key):
                        setattr(config, key, value)
                if args.verbose:
                    print(f"📄 Loaded configuration from: {args.config}")
            except Exception as e:
                # Best-effort: a broken config file degrades to defaults.
                print(f"⚠️ Warning: Could not load config file: {e}", file=sys.stderr)

        # The positional input path always wins over the config file.
        config.input_path = args.input_path

        return config
| 80 |
+
|
| 81 |
+
def main():
    """Console-script entry point: run the CLI and exit with its status."""
    cli = ComicPanelCLI()
    sys.exit(cli.run())


if __name__ == "__main__":
    main()
|
comic_panel_extractor/config.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
|
| 3 |
+
@dataclass
class Config:
    """Configuration settings for the comic-to-video pipeline.

    Attributes:
        input_path: Path to the comic image to process.
        output_folder: Working directory for intermediate and final artefacts.
        distance_threshold: Pixel distance threshold — presumably used when
            grouping detected text; confirm against TextDetector.
        vertical_threshold: Vertical pixel threshold — presumably used when
            grouping detected text; confirm against TextDetector.
        text_cood_path: JSON file holding grouped text coordinates. Derived
            from ``output_folder`` unless given explicitly.
        min_text_length: Minimum length for a detection to count as text.
    """
    input_path: str = ""
    output_folder: str = "temp_dir"
    distance_threshold: int = 70
    vertical_threshold: int = 30
    # Empty string means "derive from output_folder" (see __post_init__).
    # Bug fix: previously this default was an f-string evaluated once at
    # class-definition time, so it stayed under "temp_dir/" even when
    # output_folder was overridden for an instance.
    text_cood_path: str = ""
    min_text_length: int = 2

    def __post_init__(self) -> None:
        if not self.text_cood_path:
            self.text_cood_path = f"{self.output_folder}/detect_and_group_text.json"
|
comic_panel_extractor/image_processor.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Tuple
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from .config import Config
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import cv2
|
| 7 |
+
|
| 8 |
+
class ImageProcessor:
    """Image preprocessing steps used by the panel-extraction pipeline."""

    def __init__(self, config: "Config"):
        self.config = config

    def mask_text_regions(self, bboxes: List[List[int]], output_filename: str = "1_text_removed.jpg", color: Tuple[int, int, int] = (0, 0, 0)) -> str:
        """Mask text regions in the image to reduce panel extraction noise."""
        source = cv2.imread(self.config.input_path)
        if source is None:
            raise FileNotFoundError(f"Could not load image: {self.config.input_path}")

        # thickness=-1 fills each box solid instead of outlining it.
        for x1, y1, x2, y2 in bboxes:
            cv2.rectangle(source, (x1, y1), (x2, y2), color, thickness=-1)

        output_path = f'{self.config.output_folder}/{output_filename}'
        cv2.imwrite(output_path, source)
        print(f"✅ Text-masked image saved to: {output_path}")
        return str(output_path)

    def preprocess_image(self, masked_image_path) -> Tuple[str, str, str]:
        """Preprocess image for panel extraction.

        Returns the paths of the grayscale, binary and dilated stages.
        """
        page = cv2.imread(masked_image_path)
        if page is None:
            raise FileNotFoundError(f"Could not load image: {masked_image_path}")

        # Inverted threshold turns dark panel borders white on black.
        grayscale = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
        _, thresholded = cv2.threshold(grayscale, 200, 255, cv2.THRESH_BINARY_INV)

        # Dilation strengthens borders and bridges small gaps.
        structuring = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
        thickened = cv2.dilate(thresholded, structuring, iterations=2)

        gray_path = f'{self.config.output_folder}/2_gray.jpg'
        binary_path = f'{self.config.output_folder}/3_binary.jpg'
        dilated_path = f'{self.config.output_folder}/4_dilated.jpg'
        for path, stage in ((gray_path, grayscale),
                            (binary_path, thresholded),
                            (dilated_path, thickened)):
            cv2.imwrite(str(path), stage)

        return str(gray_path), str(binary_path), str(dilated_path)

    def clean_dilated_image(self, dilated_path: str,
                            output_filename: str = "5_dilated_cleaned.jpg",
                            max_neighbors: int = 2) -> str:
        """Clean dilated image by thinning thick borders."""
        source = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
        if source is None:
            raise FileNotFoundError(f"Could not load dilated image: {dilated_path}")

        # 1 = border (black) pixel, 0 = background; pad with a background
        # ring so every 3x3 neighbourhood lookup stays in bounds.
        ink = (source == 0).astype(np.uint8)
        framed = np.pad(ink, pad_width=1, mode="constant", constant_values=0)
        thinned = ink.copy()

        rows, cols = ink.shape
        ink_per_row = np.sum(ink, axis=1)

        for r in range(1, rows + 1):
            for c in range(1, cols + 1):
                if framed[r, c] != 1:
                    continue
                crowd = np.sum(framed[r-1:r+2, c-1:c+2]) - 1
                if crowd <= max_neighbors:
                    continue
                # Prefer to erase from whichever adjacent row already has
                # the least ink; only erase when that row is this one.
                candidates = [k for k in (r - 1, r, r + 1) if 1 <= k <= rows]
                if candidates and r == min(candidates, key=lambda k: ink_per_row[k - 1]):
                    thinned[r - 1, c - 1] = 0

        output_path = f'{self.config.output_folder}/{output_filename}'
        cv2.imwrite(str(output_path), (1 - thinned) * 255)
        print(f"✅ Cleaned dilated image saved to: {output_path}")
        return str(output_path)
|
comic_panel_extractor/main.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .text_detector import TextDetector
|
| 2 |
+
from .config import Config
|
| 3 |
+
from .image_processor import ImageProcessor
|
| 4 |
+
from .panel_extractor import PanelData
|
| 5 |
+
from .panel_extractor import PanelExtractor
|
| 6 |
+
|
| 7 |
+
from typing import List, Tuple
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import numpy as np
|
| 10 |
+
import json
|
| 11 |
+
import shutil
|
| 12 |
+
|
| 13 |
+
class ComicPanelExtractor:
    """Main class that orchestrates the comic panel extraction process."""

    def __init__(self, config: "Config"):
        self.config = config
        # Start from an empty working folder so stale artefacts from a
        # previous run can never leak into this one.
        workspace = Path(self.config.output_folder)
        if workspace.exists():
            shutil.rmtree(self.config.output_folder)
        workspace.mkdir(exist_ok=True)

        self.image_processor = ImageProcessor(self.config)
        self.panel_extractor = PanelExtractor(self.config)

    def extract_panels_from_comic(self) -> "Tuple[List[np.ndarray], List[PanelData]]":
        """Complete pipeline to extract panels from a comic image."""
        print(f"Starting panel extraction for: {self.config.input_path}")

        # Step 1: Detect and mask text regions
        bubbles = self._detect_text_bubbles()
        masked_path = self.image_processor.mask_text_regions(
            [bubble["bbox"] for bubble in bubbles]
        )

        # Step 2: Preprocess image (only the dilated stage is needed here)
        _, _, dilated_path = self.image_processor.preprocess_image(masked_path)

        # Step 3: Clean dilated image
        cleaned_path = self.image_processor.clean_dilated_image(dilated_path)

        # Step 4: Extract panels
        return self.panel_extractor.extract_panels(cleaned_path, min_width_ratio=0.1)

    def _detect_text_bubbles(self) -> List[dict]:
        """Detect text bubbles in the comic image."""
        with TextDetector(self.config) as detector:
            coords_path = detector.detect_and_group_text()
            with open(coords_path, "r", encoding="utf-8") as handle:
                return json.load(handle)

    def cleanup(self):
        """Clean up temporary files if needed."""
        # Add cleanup logic here if needed
        pass
|
comic_panel_extractor/panel_extractor.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Tuple
|
| 2 |
+
from .config import Config
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import cv2
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
|
| 8 |
+
@dataclass
class PanelData:
    """Represents an extracted comic panel."""
    x_start: int
    y_start: int
    x_end: int
    y_end: int
    width: int
    height: int
    area: int

    @classmethod
    def from_coordinates(cls, x1: int, y1: int, x2: int, y2: int) -> 'PanelData':
        """Create PanelData from coordinates."""
        # Derived dimensions are computed once here so callers never have
        # to keep width/height/area in sync with the corners.
        span_x = x2 - x1
        span_y = y2 - y1
        return cls(x1, y1, x2, y2, span_x, span_y, span_x * span_y)
|
| 31 |
+
|
| 32 |
+
class PanelExtractor:
|
| 33 |
+
"""Handles comic panel extraction using black percentage analysis."""
|
| 34 |
+
|
| 35 |
+
def __init__(self, config: Config):
|
| 36 |
+
self.config = config
|
| 37 |
+
|
| 38 |
+
def extract_panels(self, dilated_path: str, row_thresh: int = 20, col_thresh: int = 20, min_width_ratio: float = 0.1, min_height_ratio: float = 0.1, min_area_ratio: float = 0.005) -> Tuple[List[np.ndarray], List[PanelData]]:
|
| 39 |
+
"""Extract comic panels using black percentage scan."""
|
| 40 |
+
dilated = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
|
| 41 |
+
original = cv2.imread(self.config.input_path)
|
| 42 |
+
|
| 43 |
+
if dilated is None or original is None:
|
| 44 |
+
raise FileNotFoundError("Could not load dilated or original image")
|
| 45 |
+
|
| 46 |
+
height, width = dilated.shape
|
| 47 |
+
|
| 48 |
+
# Find row gutters and panel rows
|
| 49 |
+
panel_rows = self._find_panel_rows(dilated, row_thresh)
|
| 50 |
+
|
| 51 |
+
# Extract panels from each row
|
| 52 |
+
all_panels = []
|
| 53 |
+
for y1, y2 in panel_rows:
|
| 54 |
+
row_panels = self._extract_panels_from_row(dilated, y1, y2, col_thresh)
|
| 55 |
+
all_panels.extend(row_panels)
|
| 56 |
+
|
| 57 |
+
# Filter panels by size
|
| 58 |
+
filtered_panels = self._filter_panels_by_size(
|
| 59 |
+
all_panels, width, height, min_width_ratio, min_height_ratio, min_area_ratio
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
# Extract panel images and save
|
| 63 |
+
panel_images, panel_data = self._save_panels(
|
| 64 |
+
filtered_panels, original, width, height
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
return panel_images, panel_data
|
| 68 |
+
|
| 69 |
+
def _find_panel_rows(self, dilated: np.ndarray, row_thresh: int) -> List[Tuple[int, int]]:
|
| 70 |
+
"""Find panel rows by analyzing horizontal black percentages."""
|
| 71 |
+
height, width = dilated.shape
|
| 72 |
+
row_black_percentage = np.sum(dilated == 0, axis=1) / width * 100
|
| 73 |
+
|
| 74 |
+
# Find row gutters
|
| 75 |
+
row_gutters = []
|
| 76 |
+
in_gutter = False
|
| 77 |
+
for y, percent_black in enumerate(row_black_percentage):
|
| 78 |
+
if percent_black >= row_thresh and not in_gutter:
|
| 79 |
+
start_row = y
|
| 80 |
+
in_gutter = True
|
| 81 |
+
elif percent_black < row_thresh and in_gutter:
|
| 82 |
+
end_row = y
|
| 83 |
+
row_gutters.append((start_row, end_row))
|
| 84 |
+
in_gutter = False
|
| 85 |
+
|
| 86 |
+
# Convert gutters to panel rows
|
| 87 |
+
panel_rows = []
|
| 88 |
+
prev_end = 0
|
| 89 |
+
for start, end in row_gutters:
|
| 90 |
+
if start - prev_end > 10: # Minimum row height
|
| 91 |
+
panel_rows.append((prev_end, start))
|
| 92 |
+
prev_end = end
|
| 93 |
+
|
| 94 |
+
if height - prev_end > 10:
|
| 95 |
+
panel_rows.append((prev_end, height))
|
| 96 |
+
|
| 97 |
+
return panel_rows
|
| 98 |
+
|
| 99 |
+
def _extract_panels_from_row(self, dilated: np.ndarray, y1: int, y2: int,
|
| 100 |
+
col_thresh: int) -> List[Tuple[int, int, int, int]]:
|
| 101 |
+
"""Extract panels from a single row."""
|
| 102 |
+
width = dilated.shape[1]
|
| 103 |
+
row_slice = dilated[y1:y2, :]
|
| 104 |
+
col_black_percentage = np.sum(row_slice == 0, axis=0) / (y2 - y1) * 100
|
| 105 |
+
|
| 106 |
+
# Find column gutters
|
| 107 |
+
col_gutters = []
|
| 108 |
+
in_gutter = False
|
| 109 |
+
for x, percent_black in enumerate(col_black_percentage):
|
| 110 |
+
if percent_black >= col_thresh and not in_gutter:
|
| 111 |
+
start_col = x
|
| 112 |
+
in_gutter = True
|
| 113 |
+
elif percent_black < col_thresh and in_gutter:
|
| 114 |
+
end_col = x
|
| 115 |
+
col_gutters.append((start_col, end_col))
|
| 116 |
+
in_gutter = False
|
| 117 |
+
|
| 118 |
+
# Convert gutters to panel columns
|
| 119 |
+
panel_cols = []
|
| 120 |
+
prev_end = 0
|
| 121 |
+
for start, end in col_gutters:
|
| 122 |
+
if start - prev_end > 10: # Minimum column width
|
| 123 |
+
panel_cols.append((prev_end, start))
|
| 124 |
+
prev_end = end
|
| 125 |
+
|
| 126 |
+
if width - prev_end > 10:
|
| 127 |
+
panel_cols.append((prev_end, width))
|
| 128 |
+
|
| 129 |
+
return [(x1, y1, x2, y2) for x1, x2 in panel_cols]
|
| 130 |
+
|
| 131 |
+
def _filter_panels_by_size(self, panels: List[Tuple[int, int, int, int]],
|
| 132 |
+
width: int, height: int, min_width_ratio: float,
|
| 133 |
+
min_height_ratio: float, min_area_ratio: float) -> List[Tuple[int, int, int, int]]:
|
| 134 |
+
"""Filter panels by size constraints."""
|
| 135 |
+
# Remove very small panels first
|
| 136 |
+
panels = [(x1, y1, x2, y2) for x1, y1, x2, y2 in panels
|
| 137 |
+
if (x2 - x1) * (y2 - y1) >= (width * height) * min_area_ratio]
|
| 138 |
+
|
| 139 |
+
if not panels:
|
| 140 |
+
return []
|
| 141 |
+
|
| 142 |
+
# Calculate average dimensions for smart filtering
|
| 143 |
+
panel_widths = [x2 - x1 for x1, _, x2, _ in panels]
|
| 144 |
+
panel_heights = [y2 - y1 for _, y1, _, y2 in panels]
|
| 145 |
+
avg_width = np.mean(panel_widths)
|
| 146 |
+
avg_height = np.mean(panel_heights)
|
| 147 |
+
|
| 148 |
+
min_allowed_width = max(avg_width * 0.5, width * min_width_ratio)
|
| 149 |
+
min_allowed_height = max(avg_height * 0.5, height * min_height_ratio)
|
| 150 |
+
|
| 151 |
+
return [(x1, y1, x2, y2) for x1, y1, x2, y2 in panels
|
| 152 |
+
if (x2 - x1) >= min_allowed_width and (y2 - y1) >= min_allowed_height]
|
| 153 |
+
|
| 154 |
+
def _save_panels(self, panels: List[Tuple[int, int, int, int]],
                 original: np.ndarray, width: int, height: int) -> Tuple[List[np.ndarray], List[PanelData]]:
    """Crop each panel, write it to disk, and build an annotated overview.

    Returns the cropped panel images alongside their PanelData records;
    also saves every crop plus a visualization image with numbered green
    boxes to the configured output folder.
    """
    visual_output = original.copy()
    panel_images: List[np.ndarray] = []
    panel_data: List[PanelData] = []

    # 1-based numbering so on-disk names and overlay labels match.
    for idx, (x1, y1, x2, y2) in enumerate(panels, 1):
        crop = original[y1:y2, x1:x2]
        panel_images.append(crop)
        panel_data.append(PanelData.from_coordinates(x1, y1, x2, y2))

        # Persist the individual crop.
        cv2.imwrite(str(f'{self.config.output_folder}/panel_{idx}.jpg'), crop)

        # Annotate the overview copy with the panel outline and its number.
        cv2.rectangle(visual_output, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(visual_output, f"#{idx}", (x1+5, y1+25),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)

    # Persist the annotated overview once all panels are drawn.
    cv2.imwrite(str(f'{self.config.output_folder}/panels_visualization.jpg'), visual_output)

    print(f"✅ Extracted {len(panels)} panels after filtering.")
    return panel_images, panel_data
|
comic_panel_extractor/text_detector.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from typing import List, Optional
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
from .config import Config
|
| 9 |
+
|
| 10 |
+
@dataclass
class TextDetection:
    """Represents a detected text region.

    Instances are mutable: grouping may widen ``bbox`` and extend ``text``
    in place, and ``id`` is assigned only after grouping.
    """
    # Axis-aligned box as [x1, y1, x2, y2] in image pixel coordinates.
    bbox: List[int]
    # Recognized text (whitespace-stripped at construction time).
    text: str
    # OCR confidence score for this detection.
    confidence: float
    # 1-based top-to-bottom ordering id; None until assigned by grouping.
    id: Optional[int] = None
|
| 17 |
+
|
| 18 |
+
class TextDetector:
    """Handles text detection and grouping from comic images.

    Text is detected with EasyOCR (loaded lazily on first use) and nearby
    detections are merged into speech-bubble groups based on the Euclidean
    distance between bounding-box centers.
    """

    def __init__(self, config: Config):
        self.config = config
        self.reader = None  # EasyOCR reader; created lazily by load()

    def load(self):
        """Load the OCR reader (no-op if already loaded)."""
        if self.reader is None:
            import easyocr  # deferred: heavy dependency, only needed for OCR
            self.reader = easyocr.Reader(['en'])

    def detect_text(self) -> List[TextDetection]:
        """Detect text regions in the configured input image."""
        self.load()
        results = self.reader.readtext(self.config.input_path)
        print(f"EasyOCR found {len(results)} raw detections")

        detections = []
        for box, text, confidence in results:
            bbox = self._normalize_bbox(box)
            detections.append(TextDetection(
                bbox=bbox,
                text=text.strip(),
                confidence=float(confidence)
            ))

        return detections

    def _normalize_bbox(self, box: List[List[int]]) -> List[int]:
        """Convert a 4-corner polygon into an axis-aligned [x1, y1, x2, y2] box."""
        return [
            min(x[0] for x in box),
            min(x[1] for x in box),
            max(x[0] for x in box),
            max(x[1] for x in box)
        ]

    @staticmethod
    def calculate_distance(bbox1: List[int], bbox2: List[int]) -> float:
        """Calculate Euclidean distance between two bounding box centers."""
        center1 = [(bbox1[0] + bbox1[2]) / 2, (bbox1[1] + bbox1[3]) / 2]
        center2 = [(bbox2[0] + bbox2[2]) / 2, (bbox2[1] + bbox2[3]) / 2]
        return np.linalg.norm(np.subtract(center1, center2))

    def group_text_regions(self, detections: List[TextDetection]) -> List[TextDetection]:
        """Group nearby text regions into speech bubbles.

        Note: each group is seeded by one of the input detections and then
        mutated in place as neighbors are merged into it.
        """
        # Filter out detections shorter than the configured minimum length
        filtered_detections = [
            det for det in detections
            if len(det.text.strip()) >= self.config.min_text_length
        ]

        # Sort by vertical position (top to bottom) so merged text reads in order
        filtered_detections.sort(key=lambda d: d.bbox[1])

        groups = []
        for detection in filtered_detections:
            merged = False

            # Greedy: merge into the first existing group that is close enough
            for group in groups:
                if self.calculate_distance(detection.bbox, group.bbox) < self.config.distance_threshold:
                    self._merge_detections(group, detection)
                    merged = True
                    break

            if not merged:
                groups.append(detection)

        # Sort groups by vertical position and assign 1-based IDs
        groups.sort(key=lambda g: g.bbox[1])
        for idx, group in enumerate(groups):
            group.id = idx + 1

        return groups

    def _merge_detections(self, group: TextDetection, detection: TextDetection):
        """Merge ``detection`` into ``group`` in place (text concat + bbox union)."""
        group.text += " " + detection.text
        group.bbox = [
            min(group.bbox[0], detection.bbox[0]),
            min(group.bbox[1], detection.bbox[1]),
            max(group.bbox[2], detection.bbox[2]),
            max(group.bbox[3], detection.bbox[3])
        ]

    def detect_and_group_text(self) -> str:
        """Detect and group text, saving results to JSON.

        The JSON file acts as a cache: detection is skipped entirely when
        the file already exists. Returns the JSON file path either way.
        """
        if not os.path.exists(self.config.text_cood_path):
            detections = self.detect_text()
            groups = self.group_text_regions(detections)
            self._save_groups_to_json(groups, self.config.text_cood_path)
            print(f"Grouped bubbles saved: {self.config.text_cood_path}")

        return self.config.text_cood_path

    def _save_groups_to_json(self, groups: List[TextDetection], output_path: str):
        """Save grouped text detections to a UTF-8 JSON file."""
        groups_data = []
        for group in groups:
            groups_data.append({
                "id": group.id,
                "bbox": [int(x) for x in group.bbox],
                "text": group.text,
                "confidence": group.confidence
            })

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(groups_data, f, indent=2, ensure_ascii=False)

    def cleanup(self):
        """Release the OCR reader; safe to call any number of times.

        Rebinds the attribute instead of ``del self.reader`` so a second
        call (e.g. __exit__ followed by __del__) never hits a missing
        attribute — the previous bare ``except: pass`` only hid that bug.
        """
        self.reader = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.cleanup()

    def __del__(self):
        self.cleanup()
|
setup.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Packaging configuration for comic-panel-extractor."""
from setuptools import setup, find_packages

# Long description comes straight from the README so PyPI matches the repo.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Mirror requirements.txt, skipping blank lines and comments. The comment
# check runs on the *stripped* line so indented comments are also excluded
# (the previous startswith on the raw line let them through).
with open("requirements.txt", "r", encoding="utf-8") as fh:
    requirements = [
        stripped
        for raw in fh
        if (stripped := raw.strip()) and not stripped.startswith("#")
    ]

setup(
    name="comic-panel-extractor",
    version="0.1.0",
    author="Jebin Einstein E",
    author_email="jebineinstein@gmail.com",
    description="A tool for extracting panels from comic book images",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/jebin2/comic-panel-extractor",
    packages=find_packages(),
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        # Only versions satisfying python_requires are advertised; the
        # 3.8/3.9 classifiers contradicted the >=3.10 requirement below.
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
    ],
    python_requires=">=3.10",
    install_requires=requirements,
    entry_points={
        "console_scripts": [
            "comic-panel-extractor=comic_panel_extractor.cli:main",
        ],
    },
    include_package_data=True,
    zip_safe=False,
)
|
text_detector.py
DELETED
|
@@ -1,148 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
from typing import List, Tuple, Optional
|
| 3 |
-
from dataclasses import dataclass
|
| 4 |
-
import os
|
| 5 |
-
|
| 6 |
-
import numpy as np
|
| 7 |
-
from moviepy.editor import *
|
| 8 |
-
|
| 9 |
-
@dataclass
|
| 10 |
-
class Config:
|
| 11 |
-
"""Configuration settings for the comic-to-video pipeline."""
|
| 12 |
-
main_file_name: str = ""
|
| 13 |
-
comic_image: str = ""
|
| 14 |
-
temp_folder: str = ""
|
| 15 |
-
distance_threshold: int = 70
|
| 16 |
-
vertical_threshold: int = 30
|
| 17 |
-
tts_engine: str = "chatterbox"
|
| 18 |
-
resolution: Tuple[int, int] = (1920, 1080)
|
| 19 |
-
margin_ratio: float = 0.08
|
| 20 |
-
auto_scroll: bool = True
|
| 21 |
-
zoom_enabled: bool = False
|
| 22 |
-
zoom_factor: float = 1.1
|
| 23 |
-
output_video: str = "comic_text.mp4"
|
| 24 |
-
min_text_length: int = 2
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
@dataclass
|
| 28 |
-
class TextDetection:
|
| 29 |
-
"""Represents a detected text region."""
|
| 30 |
-
bbox: List[int]
|
| 31 |
-
text: str
|
| 32 |
-
confidence: float
|
| 33 |
-
id: Optional[int] = None
|
| 34 |
-
|
| 35 |
-
class TextDetector:
|
| 36 |
-
"""Handles text detection and grouping from comic images."""
|
| 37 |
-
|
| 38 |
-
def __init__(self, config: Config):
|
| 39 |
-
self.config = config
|
| 40 |
-
|
| 41 |
-
def load(self):
|
| 42 |
-
import easyocr
|
| 43 |
-
self.reader = easyocr.Reader(['en'])
|
| 44 |
-
|
| 45 |
-
def detect_text(self, image_path: str) -> List[TextDetection]:
|
| 46 |
-
"""Detect text regions in the image."""
|
| 47 |
-
self.load()
|
| 48 |
-
results = self.reader.readtext(image_path)
|
| 49 |
-
print(f"EasyOCR found {len(results)} raw detections")
|
| 50 |
-
|
| 51 |
-
detections = []
|
| 52 |
-
for box, text, confidence in results:
|
| 53 |
-
bbox = [
|
| 54 |
-
min(x[0] for x in box),
|
| 55 |
-
min(x[1] for x in box),
|
| 56 |
-
max(x[0] for x in box),
|
| 57 |
-
max(x[1] for x in box)
|
| 58 |
-
]
|
| 59 |
-
detections.append(TextDetection(
|
| 60 |
-
bbox=bbox,
|
| 61 |
-
text=text.strip(),
|
| 62 |
-
confidence=float(confidence)
|
| 63 |
-
))
|
| 64 |
-
|
| 65 |
-
return detections
|
| 66 |
-
|
| 67 |
-
@staticmethod
|
| 68 |
-
def calculate_distance(bbox1: List[int], bbox2: List[int]) -> float:
|
| 69 |
-
"""Calculate Euclidean distance between two bounding box centers."""
|
| 70 |
-
center1 = [(bbox1[0] + bbox1[2]) / 2, (bbox1[1] + bbox1[3]) / 2]
|
| 71 |
-
center2 = [(bbox2[0] + bbox2[2]) / 2, (bbox2[1] + bbox2[3]) / 2]
|
| 72 |
-
return np.linalg.norm(np.subtract(center1, center2))
|
| 73 |
-
|
| 74 |
-
def group_text_regions(self, detections: List[TextDetection]) -> List[TextDetection]:
|
| 75 |
-
"""Group nearby text regions into speech bubbles."""
|
| 76 |
-
# Filter out single character detections
|
| 77 |
-
filtered_detections = [
|
| 78 |
-
det for det in detections
|
| 79 |
-
if len(det.text.strip()) >= self.config.min_text_length
|
| 80 |
-
]
|
| 81 |
-
|
| 82 |
-
# Sort by vertical position (top to bottom)
|
| 83 |
-
filtered_detections.sort(key=lambda d: d.bbox[1])
|
| 84 |
-
|
| 85 |
-
groups = []
|
| 86 |
-
for detection in filtered_detections:
|
| 87 |
-
added_to_group = False
|
| 88 |
-
|
| 89 |
-
for group in groups:
|
| 90 |
-
if self.calculate_distance(detection.bbox, group.bbox) < self.config.distance_threshold:
|
| 91 |
-
# Merge with existing group
|
| 92 |
-
group.text += " " + detection.text
|
| 93 |
-
group.bbox = [
|
| 94 |
-
min(group.bbox[0], detection.bbox[0]),
|
| 95 |
-
min(group.bbox[1], detection.bbox[1]),
|
| 96 |
-
max(group.bbox[2], detection.bbox[2]),
|
| 97 |
-
max(group.bbox[3], detection.bbox[3])
|
| 98 |
-
]
|
| 99 |
-
added_to_group = True
|
| 100 |
-
break
|
| 101 |
-
|
| 102 |
-
if not added_to_group:
|
| 103 |
-
groups.append(detection)
|
| 104 |
-
|
| 105 |
-
# Sort groups by vertical position and assign IDs
|
| 106 |
-
groups.sort(key=lambda g: g.bbox[1])
|
| 107 |
-
for idx, group in enumerate(groups):
|
| 108 |
-
group.id = idx + 1
|
| 109 |
-
|
| 110 |
-
return groups
|
| 111 |
-
|
| 112 |
-
def detect_and_group_text(self, image_path: str) -> str:
|
| 113 |
-
"""Main method to detect and group text, saving results to JSON."""
|
| 114 |
-
|
| 115 |
-
# Save to JSON
|
| 116 |
-
output_path = self.config.output_video.replace(".mp4", "_detect_and_group_text.json")
|
| 117 |
-
if not os.path.exists(output_path):
|
| 118 |
-
detections = self.detect_text(image_path)
|
| 119 |
-
groups = self.group_text_regions(detections)
|
| 120 |
-
groups_data = []
|
| 121 |
-
for group in groups:
|
| 122 |
-
groups_data.append({
|
| 123 |
-
"id": group.id,
|
| 124 |
-
"bbox": [int(x) for x in group.bbox],
|
| 125 |
-
"text": group.text,
|
| 126 |
-
"confidence": group.confidence
|
| 127 |
-
})
|
| 128 |
-
|
| 129 |
-
with open(output_path, "w", encoding="utf-8") as f:
|
| 130 |
-
json.dump(groups_data, f, indent=2, ensure_ascii=False)
|
| 131 |
-
|
| 132 |
-
print(f"Grouped bubbles saved: {output_path}")
|
| 133 |
-
return str(output_path)
|
| 134 |
-
|
| 135 |
-
def cleanup(self):
|
| 136 |
-
try:
|
| 137 |
-
del self.reader
|
| 138 |
-
|
| 139 |
-
except: pass
|
| 140 |
-
|
| 141 |
-
def __enter__(self):
|
| 142 |
-
return self
|
| 143 |
-
|
| 144 |
-
def __exit__(self, exc_type, exc_val, exc_tb):
|
| 145 |
-
self.cleanup()
|
| 146 |
-
|
| 147 |
-
def __del__(self):
|
| 148 |
-
self.cleanup()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|