jebin2 commited on
Commit
a7da787
·
1 Parent(s): 5327cbd

test workflow

Browse files
Files changed (3) hide show
  1. ComicPanelExtractor.py +239 -0
  2. requirements.txt +4 -0
  3. text_detector.py +148 -0
ComicPanelExtractor.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import os
3
+ import json
4
+ from text_detector import TextDetector, Config as CVP_Config
5
+ import cv2
6
+ import shutil
7
+
8
+ # ----------------------------------------------------------
9
+ # MASK TEXT REGIONS
10
+ # ----------------------------------------------------------
11
+
12
def mask_text_regions(image_path, bboxes, output_path=None, color=(0, 0, 0)):
    """
    Fill the detected text regions of an image with a solid color so the text
    does not interfere with panel-border detection.

    Args:
        image_path (str): Path to the input image.
        bboxes (list of list): Bounding boxes in [x1, y1, x2, y2] format.
        output_path (str, optional): If given, the masked image is also saved here.
        color (tuple): BGR fill color for the boxes (default black).

    Returns:
        numpy.ndarray: Image with the text regions masked.

    Raises:
        Exception: If the image cannot be loaded.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise Exception(f"Could not load image: {image_path}")

    for x1, y1, x2, y2 in bboxes:
        # OCR output may carry float coordinates; cv2.rectangle requires
        # integer pixel positions, so cast defensively.
        cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)),
                      color, thickness=-1)  # thickness=-1 -> filled rectangle

    if output_path:
        cv2.imwrite(output_path, image)
        print(f"✅ Text-masked image saved to: {output_path}")

    return image
37
+
38
+
39
+ # ----------------------------------------------------------
40
+ # PRE PROCESS METHOD
41
+ # ----------------------------------------------------------
42
+
43
def pre_process(image_path, output_dir):
    """
    Build the intermediate images used by panel extraction and write them
    into ``output_dir``: grayscale (2_gray.jpg), inverted binary
    (3_binary.jpg), and dilated borders (4_dilated.jpg).

    Args:
        image_path (str): Path to the (text-masked) input image.
        output_dir (str): Directory for the intermediate artifacts;
            created if missing.

    Raises:
        Exception: If the image cannot be loaded.
    """
    # exist_ok avoids the check-then-create race of exists()/makedirs().
    os.makedirs(output_dir, exist_ok=True)

    image = cv2.imread(image_path)
    if image is None:
        raise Exception(f"Could not load image: {image_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Inverted threshold: dark panel borders become white foreground (255).
    _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)

    # Dilate to strengthen borders and close small gaps in the panel frames.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    dilated = cv2.dilate(binary, kernel, iterations=2)

    cv2.imwrite(os.path.join(output_dir, "2_gray.jpg"), gray)
    cv2.imwrite(os.path.join(output_dir, "3_binary.jpg"), binary)
    cv2.imwrite(os.path.join(output_dir, "4_dilated.jpg"), dilated)
62
+
63
+
64
+ # ----------------------------------------------------------
65
+ # CLEAN DILATED IMAGE
66
+ # ----------------------------------------------------------
67
+
68
def clean_dilated_with_row_priority(dilated_path, output_path, max_neighbors=2):
    """
    Clean a dilated comic page by thinning thick borders using Game-of-Life logic,
    with preference to clean rows that have fewer black pixels.

    Args:
        dilated_path (str): Path to the dilated image (black borders on white).
        output_path (str): Where the cleaned image is written.
        max_neighbors (int): A black pixel with more than this many black
            8-neighbors becomes a removal candidate.

    Returns:
        str: ``output_path`` (the cleaned image is also written to disk).

    Raises:
        Exception: If the dilated image cannot be loaded.
    """
    dilated = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
    if dilated is None:
        raise Exception("Could not load dilated image.")

    # 1 = black (border) pixel, 0 = white background.
    binary = (dilated == 0).astype(np.uint8)
    # Zero-pad by one pixel so every position has a full 3x3 neighborhood.
    padded = np.pad(binary, pad_width=1, mode="constant", constant_values=0)
    cleaned = binary.copy()

    height, width = binary.shape
    # Per-row black-pixel totals; rows with fewer black pixels are the
    # preferred ones to thin.
    row_black_counts = np.sum(binary, axis=1)

    # Neighbor counts are always read from the untouched `padded` copy, so each
    # pixel's decision is independent of removals made earlier in this scan.
    for y in range(1, height + 1):
        for x in range(1, width + 1):
            if padded[y, x] == 1:
                # 3x3 sum minus the center = number of black 8-neighbors.
                neighbors = np.sum(padded[y-1:y+2, x-1:x+2]) - 1
                if neighbors > max_neighbors:
                    # Of the three rows touching this pixel (clamped to the
                    # image), clear the pixel only if its own row is the one
                    # with the fewest black pixels overall.
                    neighbor_rows = [r for r in [y-1, y, y+1] if 1 <= r <= height]
                    if neighbor_rows:
                        row_to_clear = min(neighbor_rows, key=lambda r: row_black_counts[r-1])
                        if y == row_to_clear:
                            cleaned[y-1, x-1] = 0

    # Convert back to image convention: 0 = black, 255 = white.
    cleaned_img = (1 - cleaned) * 255
    cv2.imwrite(output_path, cleaned_img)
    print(f"✅ Cleaned dilated image saved to: {output_path}")
    return output_path
99
+
100
+
101
+ # ----------------------------------------------------------
102
+ # EXTRACT PANELS - BLACK PERCENTAGE METHOD
103
+ # ----------------------------------------------------------
104
+
105
def _find_gutters(black_percentages, thresh):
    """Scan a 1-D black-coverage profile and return (start, end) index pairs
    of runs whose coverage stays >= thresh."""
    gutters = []
    in_gutter = False
    start = 0
    for idx, percent_black in enumerate(black_percentages):
        if percent_black >= thresh and not in_gutter:
            start = idx
            in_gutter = True
        elif percent_black < thresh and in_gutter:
            gutters.append((start, idx))
            in_gutter = False
    # A run still open at the end of the profile is deliberately dropped,
    # matching the original inline scan behavior.
    return gutters


def _segments_between_gutters(gutters, total, min_gap=10):
    """Return the content (start, end) spans lying between gutters, keeping
    only spans wider than min_gap pixels."""
    segments = []
    prev_end = 0
    for start, end in gutters:
        if start - prev_end > min_gap:
            segments.append((prev_end, start))
        prev_end = end
    if total - prev_end > min_gap:
        segments.append((prev_end, total))
    return segments


def extract_panels_by_black_percentage_fixed(
    dilated_path, original_image_path, output_dir,
    row_thresh=20, col_thresh=20,
    min_width_ratio=0.1, min_height_ratio=0.1
):
    """
    Extract comic panels using a black-percentage scan with smart width &
    height filtering.

    Args:
        dilated_path (str): Path to the (cleaned) dilated border image.
        original_image_path (str): Path to the original comic page.
        output_dir (str): Directory where panel crops are written.
        row_thresh (float): Min % of black pixels for a row to count as a gutter.
        col_thresh (float): Min % of black pixels for a column to count as a gutter.
        min_width_ratio (float): Drop panels narrower than this fraction of page width.
        min_height_ratio (float): Drop panels shorter than this fraction of page height.

    Returns:
        tuple: (output_dir, list of panel image arrays, list of bbox dicts
        with keys x_start/y_start/x_end/y_end).

    Raises:
        Exception: If either image cannot be loaded.
    """
    os.makedirs(output_dir, exist_ok=True)

    dilated = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
    original = cv2.imread(original_image_path)
    if dilated is None or original is None:
        raise Exception("Could not load dilated or original image.")

    height, width = dilated.shape
    visual_output = original.copy()

    # Detect horizontal gutters from the per-row black coverage, then derive
    # the row bands that contain panels.
    row_black_percentage = np.sum(dilated == 0, axis=1) / width * 100
    row_gutters = _find_gutters(row_black_percentage, row_thresh)
    panel_rows = _segments_between_gutters(row_gutters, height)

    # Within each row band, detect vertical gutters and collect candidates.
    all_panels, panel_count, panel_images, panel_points = [], 0, [], []
    for y1, y2 in panel_rows:
        row_slice = dilated[y1:y2, :]
        col_black_percentage = np.sum(row_slice == 0, axis=0) / (y2 - y1) * 100
        col_gutters = _find_gutters(col_black_percentage, col_thresh)
        panel_cols = _segments_between_gutters(col_gutters, width)

        for x1, x2 in panel_cols:
            w, h = x2 - x1, y2 - y1
            # Drop specks smaller than 0.5% of the page area.
            if w * h < (width * height) * 0.005:
                continue
            all_panels.append((x1, y1, x2, y2))

    # Post-filter: drop panels much smaller than the average panel or the
    # page-ratio floor, whichever is larger.
    panel_widths = [x2 - x1 for x1, _, x2, _ in all_panels]
    panel_heights = [y2 - y1 for _, y1, _, y2 in all_panels]
    avg_width = np.mean(panel_widths) if panel_widths else 0
    avg_height = np.mean(panel_heights) if panel_heights else 0
    min_allowed_width = max(avg_width * 0.5, width * min_width_ratio)
    min_allowed_height = max(avg_height * 0.5, height * min_height_ratio)

    for x1, y1, x2, y2 in all_panels:
        panel_width, panel_height = x2 - x1, y2 - y1
        if panel_width >= min_allowed_width and panel_height >= min_allowed_height:
            panel = original[y1:y2, x1:x2]
            panel_count += 1
            panel_images.append(panel)
            panel_points.append({
                "x_start": x1, "y_start": y1, "x_end": x2, "y_end": y2
            })
            panel_path = os.path.join(output_dir, f"panel_{panel_count}.jpg")
            cv2.imwrite(panel_path, panel)
            cv2.rectangle(visual_output, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(visual_output, f"#{panel_count}", (x1+5, y1+25),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)

    # Fix: the annotated overview was drawn but never written to disk.
    cv2.imwrite(os.path.join(output_dir, "6_panels_visualized.jpg"), visual_output)

    print(f"✅ Extracted {panel_count} panels after smart width & height filtering.")
    return output_dir, panel_images, panel_points
200
+
201
+
202
+ # ----------------------------------------------------------
203
+ # MAIN EXECUTION
204
+ # ----------------------------------------------------------
205
if __name__ == "__main__":
    # Demo pipeline: input.jpg -> text masking -> preprocessing ->
    # border thinning -> panel extraction, all under extracted_panels/.
    image_path = "input.jpg"
    output_dir = "extracted_panels"
    # Start from a clean output directory on every run.
    shutil.rmtree(output_dir, ignore_errors=True)
    os.makedirs(output_dir, exist_ok=True)

    # Detect and mask text regions
    cvp_config = CVP_Config()
    cvp_config.main_file_name = image_path
    cvp_config.temp_folder = output_dir
    cvp_config.comic_image = image_path
    # output_video is only used here to derive the cache path for the
    # text-detection JSON (see TextDetector.detect_and_group_text).
    cvp_config.output_video = f"{output_dir}/test.mp4"

    # Context manager guarantees the OCR reader is released afterwards.
    with TextDetector(cvp_config) as text_detector:
        bubbles_path = text_detector.detect_and_group_text(cvp_config.comic_image)
        with open(bubbles_path, "r", encoding="utf-8") as f:
            bubbles = json.load(f)

    output_path = os.path.join(output_dir, "1_text_removed.jpg")
    masked_image = mask_text_regions(image_path, [box["bbox"] for box in bubbles], output_path=output_path)

    # Produces 2_gray.jpg / 3_binary.jpg / 4_dilated.jpg in output_dir.
    pre_process(output_path, output_dir)

    # Clean dilated image
    dilated_path = os.path.join(output_dir, "4_dilated.jpg")
    cleaned_dilated_path = os.path.join(output_dir, "5_dilated_cleaned.jpg")
    clean_dilated_with_row_priority(dilated_path, cleaned_dilated_path, max_neighbors=2)

    # Extract panels - black percentage
    extract_panels_by_black_percentage_fixed(
        cleaned_dilated_path,
        image_path,
        output_dir,
        min_width_ratio=0.1,  # Panels must be at least 10% of total width
    )
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ moviepy==1.0.3
2
+ numpy
3
+ opencv-python
4
+ easyocr
text_detector.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import List, Tuple, Optional
3
+ from dataclasses import dataclass
4
+ import os
5
+
6
+ import numpy as np
7
+ from moviepy.editor import *
8
+
9
@dataclass
class Config:
    """Configuration settings for the comic-to-video pipeline."""
    main_file_name: str = ""        # Name/path of the file being processed (set by caller)
    comic_image: str = ""           # Path to the comic page image
    temp_folder: str = ""           # Working directory for intermediate artifacts
    distance_threshold: int = 70    # Max bbox-center distance (px) to merge text into one bubble
    vertical_threshold: int = 30    # Vertical grouping tolerance (px) — not referenced in this file
    tts_engine: str = "chatterbox"  # TTS backend name — presumably used downstream; not referenced here
    resolution: Tuple[int, int] = (1920, 1080)  # Output video resolution — not referenced here
    margin_ratio: float = 0.08      # Frame margin fraction — not referenced here
    auto_scroll: bool = True        # Video scrolling option — not referenced here
    zoom_enabled: bool = False      # Video zoom toggle — not referenced here
    zoom_factor: float = 1.1        # Zoom amount when zoom_enabled — not referenced here
    output_video: str = "comic_text.mp4"  # Output video path; also keys the detection-JSON cache
    min_text_length: int = 2        # OCR detections shorter than this are discarded
25
+
26
+
27
@dataclass
class TextDetection:
    """Represents a detected text region."""
    bbox: List[int]           # Axis-aligned box as [x1, y1, x2, y2]
    text: str                 # Recognized text content
    confidence: float         # OCR confidence score
    id: Optional[int] = None  # 1-based reading-order id, assigned after grouping
34
+
35
class TextDetector:
    """Handles text detection and grouping from comic images.

    Usable as a context manager; ``cleanup`` releases the OCR reader.
    """

    def __init__(self, config: Config):
        self.config = config

    def load(self):
        """Create the EasyOCR reader if not already loaded.

        Building the reader is expensive (model load), so repeated
        detect_text calls reuse the same instance.
        """
        if not hasattr(self, "reader"):
            import easyocr
            self.reader = easyocr.Reader(['en'])

    def detect_text(self, image_path: str) -> List[TextDetection]:
        """Detect text regions in the image.

        Args:
            image_path: Path to the image to OCR.

        Returns:
            One TextDetection per raw EasyOCR result, with the
            quadrilateral box collapsed to an axis-aligned bbox.
        """
        self.load()
        results = self.reader.readtext(image_path)
        print(f"EasyOCR found {len(results)} raw detections")

        detections = []
        for box, text, confidence in results:
            # EasyOCR returns a 4-point quadrilateral; take the extremes
            # to get an axis-aligned [x1, y1, x2, y2] box.
            bbox = [
                min(x[0] for x in box),
                min(x[1] for x in box),
                max(x[0] for x in box),
                max(x[1] for x in box)
            ]
            detections.append(TextDetection(
                bbox=bbox,
                text=text.strip(),
                confidence=float(confidence)
            ))

        return detections

    @staticmethod
    def calculate_distance(bbox1: List[int], bbox2: List[int]) -> float:
        """Calculate Euclidean distance between two bounding box centers."""
        center1 = [(bbox1[0] + bbox1[2]) / 2, (bbox1[1] + bbox1[3]) / 2]
        center2 = [(bbox2[0] + bbox2[2]) / 2, (bbox2[1] + bbox2[3]) / 2]
        return np.linalg.norm(np.subtract(center1, center2))

    def group_text_regions(self, detections: List[TextDetection]) -> List[TextDetection]:
        """Group nearby text regions into speech bubbles.

        Detections whose centers lie within ``config.distance_threshold``
        of an existing group are merged into it (text concatenated, bbox
        expanded); the rest start new groups. Groups are returned in
        top-to-bottom order with 1-based ids.
        """
        # Filter out detections shorter than the configured minimum length.
        filtered_detections = [
            det for det in detections
            if len(det.text.strip()) >= self.config.min_text_length
        ]

        # Sort by vertical position (top to bottom) so merging follows
        # reading order.
        filtered_detections.sort(key=lambda d: d.bbox[1])

        groups = []
        for detection in filtered_detections:
            added_to_group = False

            for group in groups:
                if self.calculate_distance(detection.bbox, group.bbox) < self.config.distance_threshold:
                    # Merge with existing group: concatenate text and take
                    # the union of the two boxes.
                    group.text += " " + detection.text
                    group.bbox = [
                        min(group.bbox[0], detection.bbox[0]),
                        min(group.bbox[1], detection.bbox[1]),
                        max(group.bbox[2], detection.bbox[2]),
                        max(group.bbox[3], detection.bbox[3])
                    ]
                    added_to_group = True
                    break

            if not added_to_group:
                groups.append(detection)

        # Sort groups by vertical position and assign reading-order IDs.
        groups.sort(key=lambda g: g.bbox[1])
        for idx, group in enumerate(groups):
            group.id = idx + 1

        return groups

    def detect_and_group_text(self, image_path: str) -> str:
        """Detect and group text, caching the result as JSON.

        The JSON path is derived from ``config.output_video``; if the file
        already exists, detection is skipped and the cached path is
        returned unchanged.

        Returns:
            str: Path to the grouped-bubbles JSON file.
        """
        output_path = self.config.output_video.replace(".mp4", "_detect_and_group_text.json")
        if not os.path.exists(output_path):
            detections = self.detect_text(image_path)
            groups = self.group_text_regions(detections)
            groups_data = []
            for group in groups:
                groups_data.append({
                    "id": group.id,
                    "bbox": [int(x) for x in group.bbox],
                    "text": group.text,
                    "confidence": group.confidence
                })

            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(groups_data, f, indent=2, ensure_ascii=False)

            # Only announce a save when we actually wrote the file
            # (previously printed even on cache hits).
            print(f"Grouped bubbles saved: {output_path}")
        return str(output_path)

    def cleanup(self):
        """Release the OCR reader; safe to call before load() or twice."""
        try:
            del self.reader
        except AttributeError:
            # Reader was never loaded (or already released) — nothing to do.
            # Narrowed from a bare `except:` so real errors are not hidden.
            pass

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.cleanup()

    def __del__(self):
        self.cleanup()