jebin2 commited on
Commit
ec6ad2f
·
1 Parent(s): a7da787
.gitignore CHANGED
@@ -205,3 +205,4 @@ cython_debug/
205
  marimo/_static/
206
  marimo/_lsp/
207
  __marimo__/
 
 
205
  marimo/_static/
206
  marimo/_lsp/
207
  __marimo__/
208
+ temp_dir
ComicPanelExtractor.py DELETED
@@ -1,239 +0,0 @@
1
- import numpy as np
2
- import os
3
- import json
4
- from text_detector import TextDetector, Config as CVP_Config
5
- import cv2
6
- import shutil
7
-
8
- # ----------------------------------------------------------
9
- # MASK TEXT REGIONS
10
- # ----------------------------------------------------------
11
-
12
- def mask_text_regions(image_path, bboxes, output_path=None, color=(0, 0, 0)):
13
- """
14
- Make the text regions in an image white (or given color) to reduce panel extraction noise.
15
-
16
- Args:
17
- image_path (str): Path to the input image.
18
- bboxes (list of list): List of bounding boxes in [x1, y1, x2, y2] format.
19
- output_path (str, optional): Path to save the modified image.
20
- color (tuple): Color to fill the bounding boxes (default black).
21
- Returns:
22
- masked_image (numpy array): Image with masked text regions.
23
- """
24
- image = cv2.imread(image_path)
25
- if image is None:
26
- raise Exception(f"Could not load image: {image_path}")
27
-
28
- for bbox in bboxes:
29
- x1, y1, x2, y2 = bbox
30
- cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness=-1) # Fill rectangle
31
-
32
- if output_path:
33
- cv2.imwrite(output_path, image)
34
- print(f"✅ Text-masked image saved to: {output_path}")
35
-
36
- return image
37
-
38
-
39
- # ----------------------------------------------------------
40
- # PRE PROCESS METHOD
41
- # ----------------------------------------------------------
42
-
43
- def pre_process(image_path, output_dir):
44
- if not os.path.exists(output_dir):
45
- os.makedirs(output_dir)
46
-
47
- # Load and preprocess image
48
- image = cv2.imread(image_path)
49
- if image is None:
50
- raise Exception(f"Could not load image: {image_path}")
51
-
52
- gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
53
- _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
54
-
55
- # Dilate to strengthen borders and fill small gaps
56
- kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
57
- dilated = cv2.dilate(binary, kernel, iterations=2)
58
-
59
- cv2.imwrite(os.path.join(output_dir, "2_gray.jpg"), gray)
60
- cv2.imwrite(os.path.join(output_dir, "3_binary.jpg"), binary)
61
- cv2.imwrite(os.path.join(output_dir, "4_dilated.jpg"), dilated)
62
-
63
-
64
- # ----------------------------------------------------------
65
- # CLEAN DILATED IMAGE
66
- # ----------------------------------------------------------
67
-
68
- def clean_dilated_with_row_priority(dilated_path, output_path, max_neighbors=2):
69
- """
70
- Clean a dilated comic page by thinning thick borders using Game-of-Life logic,
71
- with preference to clean rows that have fewer black pixels.
72
- """
73
- dilated = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
74
- if dilated is None:
75
- raise Exception("Could not load dilated image.")
76
-
77
- binary = (dilated == 0).astype(np.uint8)
78
- padded = np.pad(binary, pad_width=1, mode="constant", constant_values=0)
79
- cleaned = binary.copy()
80
-
81
- height, width = binary.shape
82
- row_black_counts = np.sum(binary, axis=1)
83
-
84
- for y in range(1, height + 1):
85
- for x in range(1, width + 1):
86
- if padded[y, x] == 1:
87
- neighbors = np.sum(padded[y-1:y+2, x-1:x+2]) - 1
88
- if neighbors > max_neighbors:
89
- neighbor_rows = [r for r in [y-1, y, y+1] if 1 <= r <= height]
90
- if neighbor_rows:
91
- row_to_clear = min(neighbor_rows, key=lambda r: row_black_counts[r-1])
92
- if y == row_to_clear:
93
- cleaned[y-1, x-1] = 0
94
-
95
- cleaned_img = (1 - cleaned) * 255
96
- cv2.imwrite(output_path, cleaned_img)
97
- print(f"✅ Cleaned dilated image saved to: {output_path}")
98
- return output_path
99
-
100
-
101
- # ----------------------------------------------------------
102
- # EXTRACT PANELS - BLACK PERCENTAGE METHOD
103
- # ----------------------------------------------------------
104
-
105
- def extract_panels_by_black_percentage_fixed(
106
- dilated_path, original_image_path, output_dir,
107
- row_thresh=20, col_thresh=20,
108
- min_width_ratio=0.1, min_height_ratio=0.1
109
- ):
110
- """
111
- Extract comic panels using black percentage scan with smart width & height filtering.
112
- """
113
- if not os.path.exists(output_dir):
114
- os.makedirs(output_dir)
115
-
116
- dilated = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
117
- original = cv2.imread(original_image_path)
118
- if dilated is None or original is None:
119
- raise Exception("Could not load dilated or original image.")
120
-
121
- height, width = dilated.shape
122
- visual_output = original.copy()
123
-
124
- # Detect row gutters
125
- row_black_percentage = np.sum(dilated == 0, axis=1) / width * 100
126
- row_gutters, panel_rows = [], []
127
- in_gutter = False
128
- for y, percent_black in enumerate(row_black_percentage):
129
- if percent_black >= row_thresh and not in_gutter:
130
- start_row = y
131
- in_gutter = True
132
- elif percent_black < row_thresh and in_gutter:
133
- end_row = y
134
- row_gutters.append((start_row, end_row))
135
- in_gutter = False
136
-
137
- prev_end = 0
138
- for start, end in row_gutters:
139
- if start - prev_end > 10:
140
- panel_rows.append((prev_end, start))
141
- prev_end = end
142
- if height - prev_end > 10:
143
- panel_rows.append((prev_end, height))
144
-
145
- # Extract panels
146
- all_panels, panel_count, panel_images, panel_points = [], 0, [], []
147
- for y1, y2 in panel_rows:
148
- row_slice = dilated[y1:y2, :]
149
- col_black_percentage = np.sum(row_slice == 0, axis=0) / (y2 - y1) * 100
150
- col_gutters, panel_cols = [], []
151
- in_gutter_col = False
152
- for x, percent_black in enumerate(col_black_percentage):
153
- if percent_black >= col_thresh and not in_gutter_col:
154
- start_col = x
155
- in_gutter_col = True
156
- elif percent_black < col_thresh and in_gutter_col:
157
- end_col = x
158
- col_gutters.append((start_col, end_col))
159
- in_gutter_col = False
160
-
161
- prev_end_col = 0
162
- for start, end in col_gutters:
163
- if start - prev_end_col > 10:
164
- panel_cols.append((prev_end_col, start))
165
- prev_end_col = end
166
- if width - prev_end_col > 10:
167
- panel_cols.append((prev_end_col, width))
168
-
169
- for x1, x2 in panel_cols:
170
- w, h = x2 - x1, y2 - y1
171
- if w * h < (width * height) * 0.005:
172
- continue
173
- all_panels.append((x1, y1, x2, y2))
174
-
175
- # Post-filter
176
- panel_widths = [x2 - x1 for x1, _, x2, _ in all_panels]
177
- panel_heights = [y2 - y1 for _, y1, _, y2 in all_panels]
178
- avg_width = np.mean(panel_widths) if panel_widths else 0
179
- avg_height = np.mean(panel_heights) if panel_heights else 0
180
- min_allowed_width = max(avg_width * 0.5, width * min_width_ratio)
181
- min_allowed_height = max(avg_height * 0.5, height * min_height_ratio)
182
-
183
- for x1, y1, x2, y2 in all_panels:
184
- panel_width, panel_height = x2 - x1, y2 - y1
185
- if panel_width >= min_allowed_width and panel_height >= min_allowed_height:
186
- panel = original[y1:y2, x1:x2]
187
- panel_count += 1
188
- panel_images.append(panel)
189
- panel_points.append({
190
- "x_start": x1, "y_start": y1, "x_end": x2, "y_end": y2
191
- })
192
- panel_path = os.path.join(output_dir, f"panel_{panel_count}.jpg")
193
- cv2.imwrite(panel_path, panel)
194
- cv2.rectangle(visual_output, (x1, y1), (x2, y2), (0, 255, 0), 2)
195
- cv2.putText(visual_output, f"#{panel_count}", (x1+5, y1+25),
196
- cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
197
-
198
- print(f"✅ Extracted {panel_count} panels after smart width & height filtering.")
199
- return output_dir, panel_images, panel_points
200
-
201
-
202
- # ----------------------------------------------------------
203
- # MAIN EXECUTION
204
- # ----------------------------------------------------------
205
- if __name__ == "__main__":
206
- image_path = "input.jpg"
207
- output_dir = "extracted_panels"
208
- shutil.rmtree(output_dir, ignore_errors=True)
209
- os.makedirs(output_dir, exist_ok=True)
210
-
211
- # Detect and mask text regions
212
- cvp_config = CVP_Config()
213
- cvp_config.main_file_name = image_path
214
- cvp_config.temp_folder = output_dir
215
- cvp_config.comic_image = image_path
216
- cvp_config.output_video = f"{output_dir}/test.mp4"
217
-
218
- with TextDetector(cvp_config) as text_detector:
219
- bubbles_path = text_detector.detect_and_group_text(cvp_config.comic_image)
220
- with open(bubbles_path, "r", encoding="utf-8") as f:
221
- bubbles = json.load(f)
222
-
223
- output_path = os.path.join(output_dir, "1_text_removed.jpg")
224
- masked_image = mask_text_regions(image_path, [box["bbox"] for box in bubbles], output_path=output_path)
225
-
226
- pre_process(output_path, output_dir)
227
-
228
- # Clean dilated image
229
- dilated_path = os.path.join(output_dir, "4_dilated.jpg")
230
- cleaned_dilated_path = os.path.join(output_dir, "5_dilated_cleaned.jpg")
231
- clean_dilated_with_row_priority(dilated_path, cleaned_dilated_path, max_neighbors=2)
232
-
233
- # Extract panels - black percentage
234
- extract_panels_by_black_percentage_fixed(
235
- cleaned_dilated_path,
236
- image_path,
237
- output_dir,
238
- min_width_ratio=0.1, # Panels must be at least 10% of total width
239
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
comic_panel_extractor/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .main import ComicPanelExtractor
2
+ from .config import Config
3
+ from .text_detector import TextDetector, TextDetection
4
+ from .image_processor import ImageProcessor
5
+ from .panel_extractor import PanelExtractor, PanelData
6
+
7
+ __version__ = "0.1.0"
8
+ __all__ = [
9
+ "ComicPanelExtractor",
10
+ "Config",
11
+ "TextDetector",
12
+ "TextDetection",
13
+ "ImageProcessor",
14
+ "PanelExtractor",
15
+ "PanelData"
16
+ ]
comic_panel_extractor/cli.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Command-line interface for comic panel extraction.
4
+ """
5
+
6
+ import argparse
7
+ import sys
8
+ import json
9
+ from typing import Optional, List
10
+
11
+ from .main import ComicPanelExtractor
12
+ from .config import Config
13
+
14
+
15
class ComicPanelCLI:
    """Command-line interface for comic panel extraction."""

    def __init__(self):
        # Build the parser once; run() may then be invoked repeatedly.
        self.parser = self._create_parser()

    def _create_parser(self) -> argparse.ArgumentParser:
        """Create the argument parser.

        Returns:
            A configured ``argparse.ArgumentParser`` for the
            ``comic-extract`` command.
        """
        parser = argparse.ArgumentParser(
            prog="comic-extract",
            description="Extract panels from comic book images using OCR and image processing",
            formatter_class=argparse.RawDescriptionHelpFormatter,
            epilog="""
Examples:
  comic-extract comic.jpg
  comic-extract comic.jpg --config config.json
            """
        )

        # Required arguments
        parser.add_argument(
            "input_path",
            help="Path to the comic image file"
        )

        # Configuration file
        parser.add_argument(
            "--config",
            help="Path to JSON configuration file"
        )

        # BUG FIX: _load_config reads args.verbose, but no --verbose flag was
        # ever declared, so every invocation that used --config crashed with
        # AttributeError. Declaring it (default False) keeps old behavior for
        # existing invocations while making the attribute exist.
        parser.add_argument(
            "--verbose",
            action="store_true",
            help="Print extra progress information"
        )

        return parser

    def run(self, args: Optional[List[str]] = None) -> int:
        """Parse arguments, run the extraction pipeline, return an exit code.

        Args:
            args: Argument list to parse; defaults to ``sys.argv[1:]``.

        Returns:
            0 on success, 1 on any failure.
        """
        try:
            parsed_args = self.parser.parse_args(args)
            # Load configuration
            config = self._load_config(parsed_args)
            ComicPanelExtractor(config).extract_panels_from_comic()
        except Exception as e:
            print(f"❌ Error: {e}", file=sys.stderr)
            return 1
        # BUG FIX: the success path previously fell through and returned None,
        # violating the declared -> int contract (sys.exit(None) only happens
        # to exit 0). Return an explicit success code.
        return 0

    def _load_config(self, args: argparse.Namespace) -> "Config":
        """Load configuration from an optional JSON file plus CLI arguments.

        Args:
            args: Parsed command-line namespace.

        Returns:
            A ``Config`` whose fields come from the JSON file (if given),
            with ``input_path`` always taken from the command line.
        """
        config = Config()

        # Load from config file if provided
        if args.config:
            try:
                with open(args.config, 'r', encoding='utf-8') as f:
                    config_data = json.load(f)
                # Only copy keys Config actually declares; ignore unknown keys.
                for key, value in config_data.items():
                    if hasattr(config, key):
                        setattr(config, key, value)
                if args.verbose:
                    print(f"📄 Loaded configuration from: {args.config}")
            except Exception as e:
                # Best-effort: a bad config file degrades to defaults.
                print(f"⚠️ Warning: Could not load config file: {e}", file=sys.stderr)

        # Command line always wins for the input path.
        config.input_path = args.input_path

        return config
80
+
81
def main():
    """Console-script entry point: run the CLI and exit with its status."""
    sys.exit(ComicPanelCLI().run())


if __name__ == "__main__":
    main()
comic_panel_extractor/config.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
@dataclass
class Config:
    """Configuration settings for the comic-to-video pipeline."""
    # Path to the input comic image.
    input_path: str = ""
    # Working directory for all intermediate and output files.
    output_folder: str = "temp_dir"
    # Max center-to-center distance (px) for grouping text detections.
    distance_threshold: int = 70
    vertical_threshold: int = 30
    # JSON cache of grouped text coordinates. Empty string means
    # "derive from output_folder" — resolved in __post_init__.
    text_cood_path: str = ""
    # OCR detections shorter than this many characters are discarded.
    min_text_length: int = 2

    def __post_init__(self) -> None:
        # BUG FIX: the old default f"{output_folder}/detect_and_group_text.json"
        # was evaluated once at class-definition time against the class-level
        # default, so instances created with a different output_folder still
        # cached their JSON under "temp_dir". Derive the path per instance.
        if not self.text_cood_path:
            self.text_cood_path = f"{self.output_folder}/detect_and_group_text.json"
comic_panel_extractor/image_processor.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from pathlib import Path
3
+ from .config import Config
4
+
5
+ import numpy as np
6
+ import cv2
7
+
8
class ImageProcessor:
    """Handles image preprocessing operations.

    All artifacts are written into ``config.output_folder`` with numbered
    filenames (1_text_removed, 2_gray, 3_binary, 4_dilated, 5_dilated_cleaned)
    so the pipeline stages can be inspected in order.
    """

    def __init__(self, config: Config):
        # Config supplies input_path (source image) and output_folder
        # (destination for every intermediate artifact).
        self.config = config

    def mask_text_regions(self, bboxes: List[List[int]], output_filename: str = "1_text_removed.jpg", color: Tuple[int, int, int] = (0, 0, 0)) -> str:
        """Mask text regions in the image to reduce panel extraction noise.

        Args:
            bboxes: Boxes as ``[x1, y1, x2, y2]`` in pixel coordinates.
            output_filename: Name of the masked image inside output_folder.
            color: BGR fill color for the masked rectangles (default black).

        Returns:
            Path of the saved masked image.

        Raises:
            FileNotFoundError: If ``config.input_path`` cannot be read.
        """
        image = cv2.imread(self.config.input_path)
        if image is None:
            raise FileNotFoundError(f"Could not load image: {self.config.input_path}")

        for bbox in bboxes:
            x1, y1, x2, y2 = bbox
            # thickness=-1 draws a filled rectangle over the text region.
            cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness=-1)

        output_path = f'{self.config.output_folder}/{output_filename}'
        cv2.imwrite(output_path, image)
        print(f"✅ Text-masked image saved to: {output_path}")
        return str(output_path)

    def preprocess_image(self, masked_image_path: str) -> Tuple[str, str, str]:
        """Preprocess image for panel extraction.

        Converts to grayscale, inverts-thresholds (pixels darker than 200
        become white foreground), then dilates to thicken panel borders.

        Args:
            masked_image_path: Path of the text-masked image.

        Returns:
            Tuple of (gray_path, binary_path, dilated_path).

        Raises:
            FileNotFoundError: If the masked image cannot be read.
        """
        image = cv2.imread(masked_image_path)
        if image is None:
            raise FileNotFoundError(f"Could not load image: {masked_image_path}")

        # Convert to grayscale and binary
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Threshold 200 treats near-white as background; INV makes dark
        # content (borders, art) the white foreground for dilation.
        _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)

        # Dilate to strengthen borders
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
        dilated = cv2.dilate(binary, kernel, iterations=2)

        # Save intermediate results
        gray_path = f'{self.config.output_folder}/2_gray.jpg'
        binary_path = f'{self.config.output_folder}/3_binary.jpg'
        dilated_path = f'{self.config.output_folder}/4_dilated.jpg'

        cv2.imwrite(str(gray_path), gray)
        cv2.imwrite(str(binary_path), binary)
        cv2.imwrite(str(dilated_path), dilated)

        return str(gray_path), str(binary_path), str(dilated_path)

    def clean_dilated_image(self, dilated_path: str,
                            output_filename: str = "5_dilated_cleaned.jpg",
                            max_neighbors: int = 2) -> str:
        """Clean dilated image by thinning thick borders.

        Game-of-Life-style pass: a black pixel with more than
        ``max_neighbors`` black 8-neighbors is removed, but only if its own
        row is the least-black of the three rows touching it — this
        preferentially erases rows that contribute little to a border.

        Args:
            dilated_path: Path of the dilated binary image (black = border).
            output_filename: Name of the cleaned image inside output_folder.
            max_neighbors: Neighbor-count threshold above which a pixel is
                a removal candidate.

        Returns:
            Path of the saved cleaned image.

        Raises:
            FileNotFoundError: If the dilated image cannot be read.
        """
        dilated = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
        if dilated is None:
            raise FileNotFoundError(f"Could not load dilated image: {dilated_path}")

        # 1 = black (border) pixel, 0 = background.
        binary = (dilated == 0).astype(np.uint8)
        # One-pixel zero border so the 3x3 window never leaves the array.
        padded = np.pad(binary, pad_width=1, mode="constant", constant_values=0)
        cleaned = binary.copy()

        height, width = binary.shape
        # Black-pixel count per row; used to pick which row to thin.
        row_black_counts = np.sum(binary, axis=1)

        for y in range(1, height + 1):
            for x in range(1, width + 1):
                if padded[y, x] == 1:
                    # 3x3 neighborhood sum minus the pixel itself.
                    # NOTE: counts are read from the immutable `padded`, so
                    # removals in `cleaned` never cascade within this pass.
                    neighbors = np.sum(padded[y-1:y+2, x-1:x+2]) - 1
                    if neighbors > max_neighbors:
                        neighbor_rows = [r for r in [y-1, y, y+1] if 1 <= r <= height]
                        if neighbor_rows:
                            # Clear the pixel only when its own row is the
                            # sparsest of the three candidate rows.
                            row_to_clear = min(neighbor_rows, key=lambda r: row_black_counts[r-1])
                            if y == row_to_clear:
                                cleaned[y-1, x-1] = 0

        # Map back to image convention: border -> 0 (black), rest -> 255.
        cleaned_img = (1 - cleaned) * 255
        output_path = f'{self.config.output_folder}/{output_filename}'
        cv2.imwrite(str(output_path), cleaned_img)
        print(f"✅ Cleaned dilated image saved to: {output_path}")
        return str(output_path)
comic_panel_extractor/main.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .text_detector import TextDetector
2
+ from .config import Config
3
+ from .image_processor import ImageProcessor
4
+ from .panel_extractor import PanelData
5
+ from .panel_extractor import PanelExtractor
6
+
7
+ from typing import List, Tuple
8
+ from pathlib import Path
9
+ import numpy as np
10
+ import json
11
+ import shutil
12
+
13
class ComicPanelExtractor:
    """Main class that orchestrates the comic panel extraction process."""

    def __init__(self, config: Config):
        self.config = config

        # Start every run from an empty working directory.
        workdir = Path(self.config.output_folder)
        if workdir.exists():
            shutil.rmtree(workdir)
        workdir.mkdir(exist_ok=True)

        self.image_processor = ImageProcessor(self.config)
        self.panel_extractor = PanelExtractor(self.config)

    def extract_panels_from_comic(self) -> Tuple[List[np.ndarray], List[PanelData]]:
        """Complete pipeline to extract panels from a comic image."""
        print(f"Starting panel extraction for: {self.config.input_path}")

        # Step 1: locate text bubbles and mask them out of the page
        # (reduces noise during panel extraction).
        bubbles = self._detect_text_bubbles()
        masked_path = self.image_processor.mask_text_regions(
            [entry["bbox"] for entry in bubbles]
        )

        # Step 2: grayscale -> binary -> dilated border map.
        _, _, dilated_path = self.image_processor.preprocess_image(masked_path)

        # Step 3: thin the thick dilated borders.
        cleaned_path = self.image_processor.clean_dilated_image(dilated_path)

        # Step 4: gutter-scan the cleaned map into panel crops.
        return self.panel_extractor.extract_panels(cleaned_path, min_width_ratio=0.1)

    def _detect_text_bubbles(self) -> List[dict]:
        """Run text detection/grouping and load the resulting bubble list."""
        with TextDetector(self.config) as detector:
            bubbles_json = detector.detect_and_group_text()

        with open(bubbles_json, "r", encoding="utf-8") as handle:
            return json.load(handle)

    def cleanup(self):
        """Clean up temporary files if needed (currently a no-op)."""
        # Add cleanup logic here if needed
        pass
comic_panel_extractor/panel_extractor.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from .config import Config
3
+
4
+ import numpy as np
5
+ import cv2
6
+ from dataclasses import dataclass
7
+
8
@dataclass
class PanelData:
    """A single extracted comic panel: bounding box plus derived metrics."""
    # Box corners in page pixel coordinates.
    x_start: int
    y_start: int
    x_end: int
    y_end: int
    # Derived from the corners (see from_coordinates).
    width: int
    height: int
    area: int

    @classmethod
    def from_coordinates(cls, x1: int, y1: int, x2: int, y2: int) -> 'PanelData':
        """Build a PanelData from box corners, filling in the derived fields."""
        span_x = x2 - x1
        span_y = y2 - y1
        return cls(
            x_start=x1, y_start=y1,
            x_end=x2, y_end=y2,
            width=span_x, height=span_y,
            area=span_x * span_y,
        )
31
+
32
class PanelExtractor:
    """Handles comic panel extraction using black percentage analysis."""

    def __init__(self, config: Config):
        self.config = config

    def extract_panels(self, dilated_path: str, row_thresh: int = 20, col_thresh: int = 20, min_width_ratio: float = 0.1, min_height_ratio: float = 0.1, min_area_ratio: float = 0.005) -> Tuple[List[np.ndarray], List[PanelData]]:
        """Extract comic panels using black percentage scan."""
        dilated = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
        original = cv2.imread(self.config.input_path)

        if dilated is None or original is None:
            raise FileNotFoundError("Could not load dilated or original image")

        page_h, page_w = dilated.shape

        # Split the page vertically into rows, then each row into panels.
        candidates = []
        for top, bottom in self._find_panel_rows(dilated, row_thresh):
            candidates.extend(self._extract_panels_from_row(dilated, top, bottom, col_thresh))

        # Drop panels that are too small relative to the page/siblings.
        survivors = self._filter_panels_by_size(
            candidates, page_w, page_h, min_width_ratio, min_height_ratio, min_area_ratio
        )

        # Crop, save and annotate the surviving panels.
        return self._save_panels(survivors, original, page_w, page_h)

    @staticmethod
    def _scan_spans(darkness, thresh: float, extent: int) -> List[Tuple[int, int]]:
        """Split a 1-D darkness profile into content spans.

        Positions at/above `thresh` percent black form gutters; the gaps
        between gutters become spans. Spans of 10 px or less are dropped.
        A gutter still open at the end of the profile is not closed,
        matching the original scan's behavior.
        """
        gutters = []
        open_at = None
        for pos, level in enumerate(darkness):
            if level >= thresh:
                if open_at is None:
                    open_at = pos
            elif open_at is not None:
                gutters.append((open_at, pos))
                open_at = None

        spans = []
        cursor = 0
        for gutter_start, gutter_end in gutters:
            if gutter_start - cursor > 10:  # minimum span size
                spans.append((cursor, gutter_start))
            cursor = gutter_end
        if extent - cursor > 10:
            spans.append((cursor, extent))
        return spans

    def _find_panel_rows(self, dilated: np.ndarray, row_thresh: int) -> List[Tuple[int, int]]:
        """Find panel rows by analyzing horizontal black percentages."""
        page_h, page_w = dilated.shape
        darkness = np.sum(dilated == 0, axis=1) / page_w * 100
        return self._scan_spans(darkness, row_thresh, page_h)

    def _extract_panels_from_row(self, dilated: np.ndarray, y1: int, y2: int,
                                 col_thresh: int) -> List[Tuple[int, int, int, int]]:
        """Extract panels from a single row."""
        page_w = dilated.shape[1]
        darkness = np.sum(dilated[y1:y2, :] == 0, axis=0) / (y2 - y1) * 100
        return [
            (left, y1, right, y2)
            for left, right in self._scan_spans(darkness, col_thresh, page_w)
        ]

    def _filter_panels_by_size(self, panels: List[Tuple[int, int, int, int]],
                               width: int, height: int, min_width_ratio: float,
                               min_height_ratio: float, min_area_ratio: float) -> List[Tuple[int, int, int, int]]:
        """Filter panels by size constraints."""
        # First pass: discard anything below the absolute area floor.
        area_floor = (width * height) * min_area_ratio
        sized = [
            box for box in panels
            if (box[2] - box[0]) * (box[3] - box[1]) >= area_floor
        ]
        if not sized:
            return []

        # Second pass: a panel must reach half the average dimension of its
        # siblings AND the page-relative minimum.
        avg_w = np.mean([x2 - x1 for x1, _, x2, _ in sized])
        avg_h = np.mean([y2 - y1 for _, y1, _, y2 in sized])
        width_floor = max(avg_w * 0.5, width * min_width_ratio)
        height_floor = max(avg_h * 0.5, height * min_height_ratio)

        return [
            (x1, y1, x2, y2) for x1, y1, x2, y2 in sized
            if (x2 - x1) >= width_floor and (y2 - y1) >= height_floor
        ]

    def _save_panels(self, panels: List[Tuple[int, int, int, int]],
                     original: np.ndarray, width: int, height: int) -> Tuple[List[np.ndarray], List[PanelData]]:
        """Save panel images and return panel data."""
        annotated = original.copy()
        crops: List[np.ndarray] = []
        records: List[PanelData] = []

        for number, (x1, y1, x2, y2) in enumerate(panels, 1):
            crop = original[y1:y2, x1:x2]
            crops.append(crop)
            records.append(PanelData.from_coordinates(x1, y1, x2, y2))

            # Persist the crop and draw it onto the overview image.
            cv2.imwrite(f'{self.config.output_folder}/panel_{number}.jpg', crop)
            cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(annotated, f"#{number}", (x1+5, y1+25),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)

        cv2.imwrite(f'{self.config.output_folder}/panels_visualization.jpg', annotated)

        print(f"✅ Extracted {len(panels)} panels after filtering.")
        return crops, records
comic_panel_extractor/text_detector.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import json
3
+ import os
4
+ from typing import List, Optional
5
+ from dataclasses import dataclass
6
+ import numpy as np
7
+
8
+ from .config import Config
9
+
10
@dataclass
class TextDetection:
    """Represents a detected text region.

    Starts as a single OCR hit; ``TextDetector.group_text_regions`` merges
    nearby instances in place (bbox grows, texts concatenate) and assigns ids.
    """
    # Axis-aligned box as [x1, y1, x2, y2] in image pixel coordinates.
    bbox: List[int]
    # Recognized text, whitespace-stripped; grows when detections merge.
    text: str
    # Confidence of the original OCR detection (unchanged by merging).
    confidence: float
    # 1-based top-to-bottom reading-order id; assigned only after grouping.
    id: Optional[int] = None
17
+
18
class TextDetector:
    """Handles text detection and grouping from comic images.

    Usable as a context manager: the OCR reader is created lazily on first
    use and released by cleanup()/__exit__.
    """

    def __init__(self, config: "Config"):
        self.config = config
        # Lazily-created easyocr.Reader; stays None until load() runs.
        self.reader = None

    def load(self):
        """Load the OCR reader (idempotent; easyocr import is deferred
        because it is heavy)."""
        if self.reader is None:
            import easyocr
            self.reader = easyocr.Reader(['en'])

    def detect_text(self) -> List["TextDetection"]:
        """Detect text regions in the image.

        Returns:
            One TextDetection per raw EasyOCR hit, each with a normalized
            [x1, y1, x2, y2] bbox and stripped text.
        """
        self.load()
        results = self.reader.readtext(self.config.input_path)
        print(f"EasyOCR found {len(results)} raw detections")

        detections = []
        for box, text, confidence in results:
            bbox = self._normalize_bbox(box)
            detections.append(TextDetection(
                bbox=bbox,
                text=text.strip(),
                confidence=float(confidence)
            ))

        return detections

    def _normalize_bbox(self, box: List[List[int]]) -> List[int]:
        """Convert a 4-corner polygon to an axis-aligned [x1, y1, x2, y2] box."""
        return [
            min(x[0] for x in box),
            min(x[1] for x in box),
            max(x[0] for x in box),
            max(x[1] for x in box)
        ]

    @staticmethod
    def calculate_distance(bbox1: List[int], bbox2: List[int]) -> float:
        """Calculate Euclidean distance between two bounding box centers."""
        center1 = [(bbox1[0] + bbox1[2]) / 2, (bbox1[1] + bbox1[3]) / 2]
        center2 = [(bbox2[0] + bbox2[2]) / 2, (bbox2[1] + bbox2[3]) / 2]
        return np.linalg.norm(np.subtract(center1, center2))

    def group_text_regions(self, detections: List["TextDetection"]) -> List["TextDetection"]:
        """Group nearby text regions into speech bubbles.

        Greedy merge: each detection (top-to-bottom) joins the first
        existing group whose center lies within ``distance_threshold``,
        otherwise it starts a new group. Groups get 1-based ids in
        top-to-bottom order.
        """
        # Filter out detections shorter than the configured minimum.
        filtered_detections = [
            det for det in detections
            if len(det.text.strip()) >= self.config.min_text_length
        ]

        # Sort by vertical position (top to bottom)
        filtered_detections.sort(key=lambda d: d.bbox[1])

        groups = []
        for detection in filtered_detections:
            merged = False

            for group in groups:
                if self.calculate_distance(detection.bbox, group.bbox) < self.config.distance_threshold:
                    self._merge_detections(group, detection)
                    merged = True
                    break

            if not merged:
                groups.append(detection)

        # Sort groups by vertical position and assign IDs
        groups.sort(key=lambda g: g.bbox[1])
        for idx, group in enumerate(groups):
            group.id = idx + 1

        return groups

    def _merge_detections(self, group: "TextDetection", detection: "TextDetection"):
        """Merge `detection` into `group` in place: concatenate text and
        grow the bbox to the union of both boxes."""
        group.text += " " + detection.text
        group.bbox = [
            min(group.bbox[0], detection.bbox[0]),
            min(group.bbox[1], detection.bbox[1]),
            max(group.bbox[2], detection.bbox[2]),
            max(group.bbox[3], detection.bbox[3])
        ]

    def detect_and_group_text(self) -> str:
        """Main method to detect and group text, saving results to JSON.

        Returns:
            Path of the JSON file (``config.text_cood_path``). Acts as a
            cache: OCR is skipped entirely when the file already exists.
        """
        if not os.path.exists(self.config.text_cood_path):
            detections = self.detect_text()
            groups = self.group_text_regions(detections)
            self._save_groups_to_json(groups, self.config.text_cood_path)
            print(f"Grouped bubbles saved: {self.config.text_cood_path}")

        return self.config.text_cood_path

    def _save_groups_to_json(self, groups: List["TextDetection"], output_path: str):
        """Save grouped text detections to JSON file."""
        groups_data = []
        for group in groups:
            groups_data.append({
                "id": group.id,
                # bbox values may be numpy ints; coerce for JSON.
                "bbox": [int(x) for x in group.bbox],
                "text": group.text,
                "confidence": group.confidence
            })

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(groups_data, f, indent=2, ensure_ascii=False)

    def cleanup(self):
        """Release the OCR reader; safe to call repeatedly.

        BUG FIX: the old version did ``del self.reader`` behind a bare
        ``except:``. A second call (e.g. __exit__ followed by __del__) then
        raised AttributeError, which the bare except silently ate — and a
        bare except also swallows KeyboardInterrupt/SystemExit. Resetting
        the attribute to None is idempotent and needs no exception guard;
        getattr covers instances where __init__ never completed.
        """
        if getattr(self, "reader", None) is not None:
            self.reader = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.cleanup()

    def __del__(self):
        self.cleanup()
setup.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from setuptools import setup, find_packages

# Long description is the README as shown on PyPI.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Runtime dependencies live in requirements.txt; skip blanks and comments.
# BUG FIX: the old filter called startswith("#") on the unstripped line, so
# indented comment lines leaked into install_requires.
with open("requirements.txt", "r", encoding="utf-8") as fh:
    requirements = [
        stripped
        for stripped in (line.strip() for line in fh)
        if stripped and not stripped.startswith("#")
    ]

setup(
    name="comic-panel-extractor",
    version="0.1.0",
    author="Jebin Einstein E",
    author_email="jebineinstein@gmail.com",
    description="A tool for extracting panels from comic book images",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/jebin2/comic-panel-extractor",
    packages=find_packages(),
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        # BUG FIX: 3.8/3.9 classifiers contradicted python_requires>=3.10.
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
    ],
    python_requires=">=3.10",
    install_requires=requirements,
    entry_points={
        "console_scripts": [
            # NOTE(review): cli.py sets prog="comic-extract" and its examples
            # use that name, but the installed script is
            # "comic-panel-extractor" — confirm which name is intended.
            "comic-panel-extractor=comic_panel_extractor.cli:main",
        ],
    },
    include_package_data=True,
    zip_safe=False,
)
text_detector.py DELETED
@@ -1,148 +0,0 @@
1
- import json
2
- from typing import List, Tuple, Optional
3
- from dataclasses import dataclass
4
- import os
5
-
6
- import numpy as np
7
- from moviepy.editor import *
8
-
9
@dataclass
class Config:
    """Configuration settings for the comic-to-video pipeline."""
    main_file_name: str = ""        # base name for the run; meaning set by callers — TODO confirm
    comic_image: str = ""           # path to the input comic page image
    temp_folder: str = ""           # scratch directory for intermediate files
    distance_threshold: int = 70    # max center-to-center px distance when grouping text boxes
    vertical_threshold: int = 30    # vertical tolerance in px; consumed elsewhere in pipeline — TODO confirm
    tts_engine: str = "chatterbox"  # text-to-speech backend identifier
    resolution: Tuple[int, int] = (1920, 1080)  # output video (width, height) in px
    margin_ratio: float = 0.08      # fractional margin; applied by video composition — TODO confirm
    auto_scroll: bool = True        # presumably enables scrolling within panels; verify against caller
    zoom_enabled: bool = False      # enable zoom effect during playback
    zoom_factor: float = 1.1        # zoom multiplier when zoom_enabled is True
    output_video: str = "comic_text.mp4"  # output path; also seeds the grouped-text JSON filename
    min_text_length: int = 2        # OCR detections with shorter text are discarded during grouping
25
-
26
-
27
@dataclass
class TextDetection:
    """Represents a detected text region."""
    bbox: List[int]            # axis-aligned box [x1, y1, x2, y2] in image pixels
    text: str                  # recognized text content
    confidence: float          # OCR confidence score for this detection
    id: Optional[int] = None   # 1-based top-to-bottom id, assigned after grouping
34
-
35
class TextDetector:
    """Handles text detection and grouping from comic images.

    OCR is performed with EasyOCR (loaded lazily, since importing it is
    expensive); nearby detections are merged into speech-bubble-level
    groups by bounding-box center distance.
    """

    def __init__(self, config: "Config"):
        self.config = config
        # Created lazily by load(); kept as None until then so cleanup()
        # and __del__ are always safe, even if OCR never ran.
        self.reader = None

    def load(self):
        """Create the EasyOCR reader on first use.

        The import is deferred because easyocr is heavy; guarding on
        self.reader prevents rebuilding the model on every detect call
        (the previous version recreated it unconditionally).
        """
        if self.reader is None:
            import easyocr
            self.reader = easyocr.Reader(['en'])

    def detect_text(self, image_path: str) -> "List[TextDetection]":
        """Detect text regions in the image.

        Args:
            image_path: Path of the image file to OCR.

        Returns:
            One TextDetection per raw EasyOCR hit, each with an
            axis-aligned [x1, y1, x2, y2] bounding box.
        """
        self.load()
        results = self.reader.readtext(image_path)
        print(f"EasyOCR found {len(results)} raw detections")

        detections = []
        for box, text, confidence in results:
            # EasyOCR returns a 4-point polygon; collapse it to an
            # axis-aligned bounding box.
            bbox = [
                min(x[0] for x in box),
                min(x[1] for x in box),
                max(x[0] for x in box),
                max(x[1] for x in box)
            ]
            detections.append(TextDetection(
                bbox=bbox,
                text=text.strip(),
                confidence=float(confidence)
            ))

        return detections

    @staticmethod
    def calculate_distance(bbox1: "List[int]", bbox2: "List[int]") -> float:
        """Calculate Euclidean distance between two bounding box centers."""
        center1 = [(bbox1[0] + bbox1[2]) / 2, (bbox1[1] + bbox1[3]) / 2]
        center2 = [(bbox2[0] + bbox2[2]) / 2, (bbox2[1] + bbox2[3]) / 2]
        return np.linalg.norm(np.subtract(center1, center2))

    def group_text_regions(self, detections: "List[TextDetection]") -> "List[TextDetection]":
        """Group nearby text regions into speech bubbles.

        Detections shorter than config.min_text_length are dropped, then
        the rest are merged greedily whenever their centers are closer
        than config.distance_threshold. A merged group keeps its first
        member's confidence; groups receive 1-based ids in top-to-bottom
        order.
        """
        # Filter out detections that are too short to be real text.
        filtered_detections = [
            det for det in detections
            if len(det.text.strip()) >= self.config.min_text_length
        ]

        # Sort by vertical position so merged text reads top to bottom.
        filtered_detections.sort(key=lambda d: d.bbox[1])

        groups = []
        for detection in filtered_detections:
            added_to_group = False

            for group in groups:
                if self.calculate_distance(detection.bbox, group.bbox) < self.config.distance_threshold:
                    # Merge into this group: concatenate the text and
                    # expand the group's bounding box to cover both.
                    group.text += " " + detection.text
                    group.bbox = [
                        min(group.bbox[0], detection.bbox[0]),
                        min(group.bbox[1], detection.bbox[1]),
                        max(group.bbox[2], detection.bbox[2]),
                        max(group.bbox[3], detection.bbox[3])
                    ]
                    added_to_group = True
                    break

            if not added_to_group:
                groups.append(detection)

        # Sort groups by vertical position and assign reading-order ids.
        groups.sort(key=lambda g: g.bbox[1])
        for idx, group in enumerate(groups):
            group.id = idx + 1

        return groups

    def detect_and_group_text(self, image_path: str) -> str:
        """Detect and group text, caching the result in a JSON file.

        The JSON path is derived from config.output_video; when it
        already exists the expensive OCR pass is skipped entirely.

        Returns:
            Path (str) of the JSON file with the grouped bubbles.
        """
        output_path = self.config.output_video.replace(".mp4", "_detect_and_group_text.json")
        if not os.path.exists(output_path):
            detections = self.detect_text(image_path)
            groups = self.group_text_regions(detections)
            groups_data = []
            for group in groups:
                groups_data.append({
                    "id": group.id,
                    "bbox": [int(x) for x in group.bbox],
                    "text": group.text,
                    "confidence": group.confidence
                })

            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(groups_data, f, indent=2, ensure_ascii=False)

        print(f"Grouped bubbles saved: {output_path}")
        return str(output_path)

    def cleanup(self):
        """Release the OCR reader; safe to call repeatedly or before load().

        Replaces the earlier bare `except: pass` around an unconditional
        `del self.reader` — that pattern hid real errors and relied on
        an attribute that might never have been set.
        """
        if getattr(self, "reader", None) is not None:
            # Dropping the reference frees the (large) model; resetting
            # to None keeps repeated cleanup calls as no-ops.
            self.reader = None

    def __enter__(self):
        """Enter the context manager; yields this detector instance."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Exit the context manager, releasing OCR resources."""
        self.cleanup()

    def __del__(self):
        """Best-effort resource release at garbage collection."""
        self.cleanup()