jebin2 commited on
Commit
ec6ad2f
·
1 Parent(s): a7da787
.gitignore CHANGED
@@ -205,3 +205,4 @@ cython_debug/
205
  marimo/_static/
206
  marimo/_lsp/
207
  __marimo__/
 
 
205
  marimo/_static/
206
  marimo/_lsp/
207
  __marimo__/
208
+ temp_dir
ComicPanelExtractor.py DELETED
@@ -1,239 +0,0 @@
1
- import numpy as np
2
- import os
3
- import json
4
- from text_detector import TextDetector, Config as CVP_Config
5
- import cv2
6
- import shutil
7
-
8
- # ----------------------------------------------------------
9
- # MASK TEXT REGIONS
10
- # ----------------------------------------------------------
11
-
12
- def mask_text_regions(image_path, bboxes, output_path=None, color=(0, 0, 0)):
13
- """
14
- Make the text regions in an image white (or given color) to reduce panel extraction noise.
15
-
16
- Args:
17
- image_path (str): Path to the input image.
18
- bboxes (list of list): List of bounding boxes in [x1, y1, x2, y2] format.
19
- output_path (str, optional): Path to save the modified image.
20
- color (tuple): Color to fill the bounding boxes (default black).
21
- Returns:
22
- masked_image (numpy array): Image with masked text regions.
23
- """
24
- image = cv2.imread(image_path)
25
- if image is None:
26
- raise Exception(f"Could not load image: {image_path}")
27
-
28
- for bbox in bboxes:
29
- x1, y1, x2, y2 = bbox
30
- cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness=-1) # Fill rectangle
31
-
32
- if output_path:
33
- cv2.imwrite(output_path, image)
34
- print(f"✅ Text-masked image saved to: {output_path}")
35
-
36
- return image
37
-
38
-
39
- # ----------------------------------------------------------
40
- # PRE PROCESS METHOD
41
- # ----------------------------------------------------------
42
-
43
- def pre_process(image_path, output_dir):
44
- if not os.path.exists(output_dir):
45
- os.makedirs(output_dir)
46
-
47
- # Load and preprocess image
48
- image = cv2.imread(image_path)
49
- if image is None:
50
- raise Exception(f"Could not load image: {image_path}")
51
-
52
- gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
53
- _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
54
-
55
- # Dilate to strengthen borders and fill small gaps
56
- kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
57
- dilated = cv2.dilate(binary, kernel, iterations=2)
58
-
59
- cv2.imwrite(os.path.join(output_dir, "2_gray.jpg"), gray)
60
- cv2.imwrite(os.path.join(output_dir, "3_binary.jpg"), binary)
61
- cv2.imwrite(os.path.join(output_dir, "4_dilated.jpg"), dilated)
62
-
63
-
64
- # ----------------------------------------------------------
65
- # CLEAN DILATED IMAGE
66
- # ----------------------------------------------------------
67
-
68
- def clean_dilated_with_row_priority(dilated_path, output_path, max_neighbors=2):
69
- """
70
- Clean a dilated comic page by thinning thick borders using Game-of-Life logic,
71
- with preference to clean rows that have fewer black pixels.
72
- """
73
- dilated = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
74
- if dilated is None:
75
- raise Exception("Could not load dilated image.")
76
-
77
- binary = (dilated == 0).astype(np.uint8)
78
- padded = np.pad(binary, pad_width=1, mode="constant", constant_values=0)
79
- cleaned = binary.copy()
80
-
81
- height, width = binary.shape
82
- row_black_counts = np.sum(binary, axis=1)
83
-
84
- for y in range(1, height + 1):
85
- for x in range(1, width + 1):
86
- if padded[y, x] == 1:
87
- neighbors = np.sum(padded[y-1:y+2, x-1:x+2]) - 1
88
- if neighbors > max_neighbors:
89
- neighbor_rows = [r for r in [y-1, y, y+1] if 1 <= r <= height]
90
- if neighbor_rows:
91
- row_to_clear = min(neighbor_rows, key=lambda r: row_black_counts[r-1])
92
- if y == row_to_clear:
93
- cleaned[y-1, x-1] = 0
94
-
95
- cleaned_img = (1 - cleaned) * 255
96
- cv2.imwrite(output_path, cleaned_img)
97
- print(f"✅ Cleaned dilated image saved to: {output_path}")
98
- return output_path
99
-
100
-
101
- # ----------------------------------------------------------
102
- # EXTRACT PANELS - BLACK PERCENTAGE METHOD
103
- # ----------------------------------------------------------
104
-
105
- def extract_panels_by_black_percentage_fixed(
106
- dilated_path, original_image_path, output_dir,
107
- row_thresh=20, col_thresh=20,
108
- min_width_ratio=0.1, min_height_ratio=0.1
109
- ):
110
- """
111
- Extract comic panels using black percentage scan with smart width & height filtering.
112
- """
113
- if not os.path.exists(output_dir):
114
- os.makedirs(output_dir)
115
-
116
- dilated = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
117
- original = cv2.imread(original_image_path)
118
- if dilated is None or original is None:
119
- raise Exception("Could not load dilated or original image.")
120
-
121
- height, width = dilated.shape
122
- visual_output = original.copy()
123
-
124
- # Detect row gutters
125
- row_black_percentage = np.sum(dilated == 0, axis=1) / width * 100
126
- row_gutters, panel_rows = [], []
127
- in_gutter = False
128
- for y, percent_black in enumerate(row_black_percentage):
129
- if percent_black >= row_thresh and not in_gutter:
130
- start_row = y
131
- in_gutter = True
132
- elif percent_black < row_thresh and in_gutter:
133
- end_row = y
134
- row_gutters.append((start_row, end_row))
135
- in_gutter = False
136
-
137
- prev_end = 0
138
- for start, end in row_gutters:
139
- if start - prev_end > 10:
140
- panel_rows.append((prev_end, start))
141
- prev_end = end
142
- if height - prev_end > 10:
143
- panel_rows.append((prev_end, height))
144
-
145
- # Extract panels
146
- all_panels, panel_count, panel_images, panel_points = [], 0, [], []
147
- for y1, y2 in panel_rows:
148
- row_slice = dilated[y1:y2, :]
149
- col_black_percentage = np.sum(row_slice == 0, axis=0) / (y2 - y1) * 100
150
- col_gutters, panel_cols = [], []
151
- in_gutter_col = False
152
- for x, percent_black in enumerate(col_black_percentage):
153
- if percent_black >= col_thresh and not in_gutter_col:
154
- start_col = x
155
- in_gutter_col = True
156
- elif percent_black < col_thresh and in_gutter_col:
157
- end_col = x
158
- col_gutters.append((start_col, end_col))
159
- in_gutter_col = False
160
-
161
- prev_end_col = 0
162
- for start, end in col_gutters:
163
- if start - prev_end_col > 10:
164
- panel_cols.append((prev_end_col, start))
165
- prev_end_col = end
166
- if width - prev_end_col > 10:
167
- panel_cols.append((prev_end_col, width))
168
-
169
- for x1, x2 in panel_cols:
170
- w, h = x2 - x1, y2 - y1
171
- if w * h < (width * height) * 0.005:
172
- continue
173
- all_panels.append((x1, y1, x2, y2))
174
-
175
- # Post-filter
176
- panel_widths = [x2 - x1 for x1, _, x2, _ in all_panels]
177
- panel_heights = [y2 - y1 for _, y1, _, y2 in all_panels]
178
- avg_width = np.mean(panel_widths) if panel_widths else 0
179
- avg_height = np.mean(panel_heights) if panel_heights else 0
180
- min_allowed_width = max(avg_width * 0.5, width * min_width_ratio)
181
- min_allowed_height = max(avg_height * 0.5, height * min_height_ratio)
182
-
183
- for x1, y1, x2, y2 in all_panels:
184
- panel_width, panel_height = x2 - x1, y2 - y1
185
- if panel_width >= min_allowed_width and panel_height >= min_allowed_height:
186
- panel = original[y1:y2, x1:x2]
187
- panel_count += 1
188
- panel_images.append(panel)
189
- panel_points.append({
190
- "x_start": x1, "y_start": y1, "x_end": x2, "y_end": y2
191
- })
192
- panel_path = os.path.join(output_dir, f"panel_{panel_count}.jpg")
193
- cv2.imwrite(panel_path, panel)
194
- cv2.rectangle(visual_output, (x1, y1), (x2, y2), (0, 255, 0), 2)
195
- cv2.putText(visual_output, f"#{panel_count}", (x1+5, y1+25),
196
- cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
197
-
198
- print(f"✅ Extracted {panel_count} panels after smart width & height filtering.")
199
- return output_dir, panel_images, panel_points
200
-
201
-
202
- # ----------------------------------------------------------
203
- # MAIN EXECUTION
204
- # ----------------------------------------------------------
205
- if __name__ == "__main__":
206
- image_path = "input.jpg"
207
- output_dir = "extracted_panels"
208
- shutil.rmtree(output_dir, ignore_errors=True)
209
- os.makedirs(output_dir, exist_ok=True)
210
-
211
- # Detect and mask text regions
212
- cvp_config = CVP_Config()
213
- cvp_config.main_file_name = image_path
214
- cvp_config.temp_folder = output_dir
215
- cvp_config.comic_image = image_path
216
- cvp_config.output_video = f"{output_dir}/test.mp4"
217
-
218
- with TextDetector(cvp_config) as text_detector:
219
- bubbles_path = text_detector.detect_and_group_text(cvp_config.comic_image)
220
- with open(bubbles_path, "r", encoding="utf-8") as f:
221
- bubbles = json.load(f)
222
-
223
- output_path = os.path.join(output_dir, "1_text_removed.jpg")
224
- masked_image = mask_text_regions(image_path, [box["bbox"] for box in bubbles], output_path=output_path)
225
-
226
- pre_process(output_path, output_dir)
227
-
228
- # Clean dilated image
229
- dilated_path = os.path.join(output_dir, "4_dilated.jpg")
230
- cleaned_dilated_path = os.path.join(output_dir, "5_dilated_cleaned.jpg")
231
- clean_dilated_with_row_priority(dilated_path, cleaned_dilated_path, max_neighbors=2)
232
-
233
- # Extract panels - black percentage
234
- extract_panels_by_black_percentage_fixed(
235
- cleaned_dilated_path,
236
- image_path,
237
- output_dir,
238
- min_width_ratio=0.1, # Panels must be at least 10% of total width
239
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
comic_panel_extractor/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .main import ComicPanelExtractor
2
+ from .config import Config
3
+ from .text_detector import TextDetector, TextDetection
4
+ from .image_processor import ImageProcessor
5
+ from .panel_extractor import PanelExtractor, PanelData
6
+
7
+ __version__ = "0.1.0"
8
+ __all__ = [
9
+ "ComicPanelExtractor",
10
+ "Config",
11
+ "TextDetector",
12
+ "TextDetection",
13
+ "ImageProcessor",
14
+ "PanelExtractor",
15
+ "PanelData"
16
+ ]
comic_panel_extractor/cli.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Command-line interface for comic panel extraction.
4
+ """
5
+
6
+ import argparse
7
+ import sys
8
+ import json
9
+ from typing import Optional, List
10
+
11
+ from .main import ComicPanelExtractor
12
+ from .config import Config
13
+
14
+
15
class ComicPanelCLI:
    """Command-line interface for comic panel extraction."""

    def __init__(self):
        # Build the parser once; run() may then be invoked repeatedly.
        self.parser = self._create_parser()

    def _create_parser(self) -> argparse.ArgumentParser:
        """Create the argument parser.

        Returns:
            A configured ``argparse.ArgumentParser`` for the
            ``comic-extract`` command.
        """
        parser = argparse.ArgumentParser(
            prog="comic-extract",
            description="Extract panels from comic book images using OCR and image processing",
            formatter_class=argparse.RawDescriptionHelpFormatter,
            epilog="""
Examples:
  comic-extract comic.jpg
  comic-extract comic.jpg --config config.json
            """
        )

        # Required arguments
        parser.add_argument(
            "input_path",
            help="Path to the comic image file"
        )

        # Configuration file
        parser.add_argument(
            "--config",
            help="Path to JSON configuration file"
        )

        # BUG FIX: _load_config reads args.verbose, but no --verbose flag was
        # ever declared, so every invocation that used --config crashed with
        # AttributeError. Declaring it (default False) keeps old behavior for
        # existing invocations while making the attribute exist.
        parser.add_argument(
            "--verbose",
            action="store_true",
            help="Print extra progress information"
        )

        return parser

    def run(self, args: Optional[List[str]] = None) -> int:
        """Parse arguments, run the extraction pipeline, return an exit code.

        Args:
            args: Argument list to parse; defaults to ``sys.argv[1:]``.

        Returns:
            0 on success, 1 on any failure.
        """
        try:
            parsed_args = self.parser.parse_args(args)
            # Load configuration
            config = self._load_config(parsed_args)
            ComicPanelExtractor(config).extract_panels_from_comic()
        except Exception as e:
            print(f"❌ Error: {e}", file=sys.stderr)
            return 1
        # BUG FIX: the success path previously fell through and returned None,
        # violating the declared -> int contract (sys.exit(None) only happens
        # to exit 0). Return an explicit success code.
        return 0

    def _load_config(self, args: argparse.Namespace) -> "Config":
        """Load configuration from an optional JSON file plus CLI arguments.

        Args:
            args: Parsed command-line namespace.

        Returns:
            A ``Config`` whose fields come from the JSON file (if given),
            with ``input_path`` always taken from the command line.
        """
        config = Config()

        # Load from config file if provided
        if args.config:
            try:
                with open(args.config, 'r', encoding='utf-8') as f:
                    config_data = json.load(f)
                # Only copy keys Config actually declares; ignore unknown keys.
                for key, value in config_data.items():
                    if hasattr(config, key):
                        setattr(config, key, value)
                if args.verbose:
                    print(f"📄 Loaded configuration from: {args.config}")
            except Exception as e:
                # Best-effort: a bad config file degrades to defaults.
                print(f"⚠️ Warning: Could not load config file: {e}", file=sys.stderr)

        # Command line always wins for the input path.
        config.input_path = args.input_path

        return config
80
+
81
def main():
    """Console-script entry point: run the CLI and exit with its status."""
    sys.exit(ComicPanelCLI().run())


if __name__ == "__main__":
    main()
comic_panel_extractor/config.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
@dataclass
class Config:
    """Configuration settings for the comic-to-video pipeline."""
    # Path to the input comic image.
    input_path: str = ""
    # Working directory for all intermediate and output files.
    output_folder: str = "temp_dir"
    # Max center-to-center distance (px) for grouping text detections.
    distance_threshold: int = 70
    vertical_threshold: int = 30
    # JSON cache of grouped text coordinates. Empty string means
    # "derive from output_folder" — resolved in __post_init__.
    text_cood_path: str = ""
    # OCR detections shorter than this many characters are discarded.
    min_text_length: int = 2

    def __post_init__(self) -> None:
        # BUG FIX: the old default f"{output_folder}/detect_and_group_text.json"
        # was evaluated once at class-definition time against the class-level
        # default, so instances created with a different output_folder still
        # cached their JSON under "temp_dir". Derive the path per instance.
        if not self.text_cood_path:
            self.text_cood_path = f"{self.output_folder}/detect_and_group_text.json"
comic_panel_extractor/image_processor.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from pathlib import Path
3
+ from .config import Config
4
+
5
+ import numpy as np
6
+ import cv2
7
+
8
class ImageProcessor:
    """Handles image preprocessing operations.

    All artifacts are written into ``config.output_folder`` with numbered
    filenames (1_text_removed, 2_gray, 3_binary, 4_dilated, 5_dilated_cleaned)
    so the pipeline stages can be inspected in order.
    """

    def __init__(self, config: Config):
        # Config supplies input_path (source image) and output_folder
        # (destination for every intermediate artifact).
        self.config = config

    def mask_text_regions(self, bboxes: List[List[int]], output_filename: str = "1_text_removed.jpg", color: Tuple[int, int, int] = (0, 0, 0)) -> str:
        """Mask text regions in the image to reduce panel extraction noise.

        Args:
            bboxes: Boxes as ``[x1, y1, x2, y2]`` in pixel coordinates.
            output_filename: Name of the masked image inside output_folder.
            color: BGR fill color for the masked rectangles (default black).

        Returns:
            Path of the saved masked image.

        Raises:
            FileNotFoundError: If ``config.input_path`` cannot be read.
        """
        image = cv2.imread(self.config.input_path)
        if image is None:
            raise FileNotFoundError(f"Could not load image: {self.config.input_path}")

        for bbox in bboxes:
            x1, y1, x2, y2 = bbox
            # thickness=-1 draws a filled rectangle over the text region.
            cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness=-1)

        output_path = f'{self.config.output_folder}/{output_filename}'
        cv2.imwrite(output_path, image)
        print(f"✅ Text-masked image saved to: {output_path}")
        return str(output_path)

    def preprocess_image(self, masked_image_path: str) -> Tuple[str, str, str]:
        """Preprocess image for panel extraction.

        Converts to grayscale, inverts-thresholds (pixels darker than 200
        become white foreground), then dilates to thicken panel borders.

        Args:
            masked_image_path: Path of the text-masked image.

        Returns:
            Tuple of (gray_path, binary_path, dilated_path).

        Raises:
            FileNotFoundError: If the masked image cannot be read.
        """
        image = cv2.imread(masked_image_path)
        if image is None:
            raise FileNotFoundError(f"Could not load image: {masked_image_path}")

        # Convert to grayscale and binary
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Threshold 200 treats near-white as background; INV makes dark
        # content (borders, art) the white foreground for dilation.
        _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)

        # Dilate to strengthen borders
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
        dilated = cv2.dilate(binary, kernel, iterations=2)

        # Save intermediate results
        gray_path = f'{self.config.output_folder}/2_gray.jpg'
        binary_path = f'{self.config.output_folder}/3_binary.jpg'
        dilated_path = f'{self.config.output_folder}/4_dilated.jpg'

        cv2.imwrite(str(gray_path), gray)
        cv2.imwrite(str(binary_path), binary)
        cv2.imwrite(str(dilated_path), dilated)

        return str(gray_path), str(binary_path), str(dilated_path)

    def clean_dilated_image(self, dilated_path: str,
                            output_filename: str = "5_dilated_cleaned.jpg",
                            max_neighbors: int = 2) -> str:
        """Clean dilated image by thinning thick borders.

        Game-of-Life-style pass: a black pixel with more than
        ``max_neighbors`` black 8-neighbors is removed, but only if its own
        row is the least-black of the three rows touching it — this
        preferentially erases rows that contribute little to a border.

        Args:
            dilated_path: Path of the dilated binary image (black = border).
            output_filename: Name of the cleaned image inside output_folder.
            max_neighbors: Neighbor-count threshold above which a pixel is
                a removal candidate.

        Returns:
            Path of the saved cleaned image.

        Raises:
            FileNotFoundError: If the dilated image cannot be read.
        """
        dilated = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
        if dilated is None:
            raise FileNotFoundError(f"Could not load dilated image: {dilated_path}")

        # 1 = black (border) pixel, 0 = background.
        binary = (dilated == 0).astype(np.uint8)
        # One-pixel zero border so the 3x3 window never leaves the array.
        padded = np.pad(binary, pad_width=1, mode="constant", constant_values=0)
        cleaned = binary.copy()

        height, width = binary.shape
        # Black-pixel count per row; used to pick which row to thin.
        row_black_counts = np.sum(binary, axis=1)

        for y in range(1, height + 1):
            for x in range(1, width + 1):
                if padded[y, x] == 1:
                    # 3x3 neighborhood sum minus the pixel itself.
                    # NOTE: counts are read from the immutable `padded`, so
                    # removals in `cleaned` never cascade within this pass.
                    neighbors = np.sum(padded[y-1:y+2, x-1:x+2]) - 1
                    if neighbors > max_neighbors:
                        neighbor_rows = [r for r in [y-1, y, y+1] if 1 <= r <= height]
                        if neighbor_rows:
                            # Clear the pixel only when its own row is the
                            # sparsest of the three candidate rows.
                            row_to_clear = min(neighbor_rows, key=lambda r: row_black_counts[r-1])
                            if y == row_to_clear:
                                cleaned[y-1, x-1] = 0

        # Map back to image convention: border -> 0 (black), rest -> 255.
        cleaned_img = (1 - cleaned) * 255
        output_path = f'{self.config.output_folder}/{output_filename}'
        cv2.imwrite(str(output_path), cleaned_img)
        print(f"✅ Cleaned dilated image saved to: {output_path}")
        return str(output_path)
comic_panel_extractor/main.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .text_detector import TextDetector
2
+ from .config import Config
3
+ from .image_processor import ImageProcessor
4
+ from .panel_extractor import PanelData
5
+ from .panel_extractor import PanelExtractor
6
+
7
+ from typing import List, Tuple
8
+ from pathlib import Path
9
+ import numpy as np
10
+ import json
11
+ import shutil
12
+
13
class ComicPanelExtractor:
    """Main class that orchestrates the comic panel extraction process."""

    def __init__(self, config: Config):
        self.config = config

        # Start every run from an empty working directory.
        workdir = Path(self.config.output_folder)
        if workdir.exists():
            shutil.rmtree(workdir)
        workdir.mkdir(exist_ok=True)

        self.image_processor = ImageProcessor(self.config)
        self.panel_extractor = PanelExtractor(self.config)

    def extract_panels_from_comic(self) -> Tuple[List[np.ndarray], List[PanelData]]:
        """Complete pipeline to extract panels from a comic image."""
        print(f"Starting panel extraction for: {self.config.input_path}")

        # Step 1: locate text bubbles and mask them out of the page
        # (reduces noise during panel extraction).
        bubbles = self._detect_text_bubbles()
        masked_path = self.image_processor.mask_text_regions(
            [entry["bbox"] for entry in bubbles]
        )

        # Step 2: grayscale -> binary -> dilated border map.
        _, _, dilated_path = self.image_processor.preprocess_image(masked_path)

        # Step 3: thin the thick dilated borders.
        cleaned_path = self.image_processor.clean_dilated_image(dilated_path)

        # Step 4: gutter-scan the cleaned map into panel crops.
        return self.panel_extractor.extract_panels(cleaned_path, min_width_ratio=0.1)

    def _detect_text_bubbles(self) -> List[dict]:
        """Run text detection/grouping and load the resulting bubble list."""
        with TextDetector(self.config) as detector:
            bubbles_json = detector.detect_and_group_text()

        with open(bubbles_json, "r", encoding="utf-8") as handle:
            return json.load(handle)

    def cleanup(self):
        """Clean up temporary files if needed (currently a no-op)."""
        # Add cleanup logic here if needed
        pass
comic_panel_extractor/panel_extractor.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from .config import Config
3
+
4
+ import numpy as np
5
+ import cv2
6
+ from dataclasses import dataclass
7
+
8
@dataclass
class PanelData:
    """A single extracted comic panel: bounding box plus derived metrics."""
    # Box corners in page pixel coordinates.
    x_start: int
    y_start: int
    x_end: int
    y_end: int
    # Derived from the corners (see from_coordinates).
    width: int
    height: int
    area: int

    @classmethod
    def from_coordinates(cls, x1: int, y1: int, x2: int, y2: int) -> 'PanelData':
        """Build a PanelData from box corners, filling in the derived fields."""
        span_x = x2 - x1
        span_y = y2 - y1
        return cls(
            x_start=x1, y_start=y1,
            x_end=x2, y_end=y2,
            width=span_x, height=span_y,
            area=span_x * span_y,
        )
31
+
32
class PanelExtractor:
    """Handles comic panel extraction using black percentage analysis."""

    def __init__(self, config: Config):
        self.config = config

    def extract_panels(self, dilated_path: str, row_thresh: int = 20, col_thresh: int = 20, min_width_ratio: float = 0.1, min_height_ratio: float = 0.1, min_area_ratio: float = 0.005) -> Tuple[List[np.ndarray], List[PanelData]]:
        """Extract comic panels using black percentage scan."""
        dilated = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
        original = cv2.imread(self.config.input_path)

        if dilated is None or original is None:
            raise FileNotFoundError("Could not load dilated or original image")

        page_h, page_w = dilated.shape

        # Split the page vertically into rows, then each row into panels.
        candidates = []
        for top, bottom in self._find_panel_rows(dilated, row_thresh):
            candidates.extend(self._extract_panels_from_row(dilated, top, bottom, col_thresh))

        # Drop panels that are too small relative to the page/siblings.
        survivors = self._filter_panels_by_size(
            candidates, page_w, page_h, min_width_ratio, min_height_ratio, min_area_ratio
        )

        # Crop, save and annotate the surviving panels.
        return self._save_panels(survivors, original, page_w, page_h)

    @staticmethod
    def _scan_spans(darkness, thresh: float, extent: int) -> List[Tuple[int, int]]:
        """Split a 1-D darkness profile into content spans.

        Positions at/above `thresh` percent black form gutters; the gaps
        between gutters become spans. Spans of 10 px or less are dropped.
        A gutter still open at the end of the profile is not closed,
        matching the original scan's behavior.
        """
        gutters = []
        open_at = None
        for pos, level in enumerate(darkness):
            if level >= thresh:
                if open_at is None:
                    open_at = pos
            elif open_at is not None:
                gutters.append((open_at, pos))
                open_at = None

        spans = []
        cursor = 0
        for gutter_start, gutter_end in gutters:
            if gutter_start - cursor > 10:  # minimum span size
                spans.append((cursor, gutter_start))
            cursor = gutter_end
        if extent - cursor > 10:
            spans.append((cursor, extent))
        return spans

    def _find_panel_rows(self, dilated: np.ndarray, row_thresh: int) -> List[Tuple[int, int]]:
        """Find panel rows by analyzing horizontal black percentages."""
        page_h, page_w = dilated.shape
        darkness = np.sum(dilated == 0, axis=1) / page_w * 100
        return self._scan_spans(darkness, row_thresh, page_h)

    def _extract_panels_from_row(self, dilated: np.ndarray, y1: int, y2: int,
                                 col_thresh: int) -> List[Tuple[int, int, int, int]]:
        """Extract panels from a single row."""
        page_w = dilated.shape[1]
        darkness = np.sum(dilated[y1:y2, :] == 0, axis=0) / (y2 - y1) * 100
        return [
            (left, y1, right, y2)
            for left, right in self._scan_spans(darkness, col_thresh, page_w)
        ]

    def _filter_panels_by_size(self, panels: List[Tuple[int, int, int, int]],
                               width: int, height: int, min_width_ratio: float,
                               min_height_ratio: float, min_area_ratio: float) -> List[Tuple[int, int, int, int]]:
        """Filter panels by size constraints."""
        # First pass: discard anything below the absolute area floor.
        area_floor = (width * height) * min_area_ratio
        sized = [
            box for box in panels
            if (box[2] - box[0]) * (box[3] - box[1]) >= area_floor
        ]
        if not sized:
            return []

        # Second pass: a panel must reach half the average dimension of its
        # siblings AND the page-relative minimum.
        avg_w = np.mean([x2 - x1 for x1, _, x2, _ in sized])
        avg_h = np.mean([y2 - y1 for _, y1, _, y2 in sized])
        width_floor = max(avg_w * 0.5, width * min_width_ratio)
        height_floor = max(avg_h * 0.5, height * min_height_ratio)

        return [
            (x1, y1, x2, y2) for x1, y1, x2, y2 in sized
            if (x2 - x1) >= width_floor and (y2 - y1) >= height_floor
        ]

    def _save_panels(self, panels: List[Tuple[int, int, int, int]],
                     original: np.ndarray, width: int, height: int) -> Tuple[List[np.ndarray], List[PanelData]]:
        """Save panel images and return panel data."""
        annotated = original.copy()
        crops: List[np.ndarray] = []
        records: List[PanelData] = []

        for number, (x1, y1, x2, y2) in enumerate(panels, 1):
            crop = original[y1:y2, x1:x2]
            crops.append(crop)
            records.append(PanelData.from_coordinates(x1, y1, x2, y2))

            # Persist the crop and draw it onto the overview image.
            cv2.imwrite(f'{self.config.output_folder}/panel_{number}.jpg', crop)
            cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(annotated, f"#{number}", (x1+5, y1+25),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)

        cv2.imwrite(f'{self.config.output_folder}/panels_visualization.jpg', annotated)

        print(f"✅ Extracted {len(panels)} panels after filtering.")
        return crops, records
comic_panel_extractor/text_detector.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import json
3
+ import os
4
+ from typing import List, Optional
5
+ from dataclasses import dataclass
6
+ import numpy as np
7
+
8
+ from .config import Config
9
+
10
@dataclass
class TextDetection:
    """Represents a detected text region.

    Starts as a single OCR hit; ``TextDetector.group_text_regions`` merges
    nearby instances in place (bbox grows, texts concatenate) and assigns ids.
    """
    # Axis-aligned box as [x1, y1, x2, y2] in image pixel coordinates.
    bbox: List[int]
    # Recognized text, whitespace-stripped; grows when detections merge.
    text: str
    # Confidence of the original OCR detection (unchanged by merging).
    confidence: float
    # 1-based top-to-bottom reading-order id; assigned only after grouping.
    id: Optional[int] = None
17
+
18
class TextDetector:
    """Handles text detection and grouping from comic images.

    Usable as a context manager: the OCR reader is created lazily on first
    use and released by cleanup()/__exit__.
    """

    def __init__(self, config: "Config"):
        self.config = config
        # Lazily-created easyocr.Reader; stays None until load() runs.
        self.reader = None

    def load(self):
        """Load the OCR reader (idempotent; easyocr import is deferred
        because it is heavy)."""
        if self.reader is None:
            import easyocr
            self.reader = easyocr.Reader(['en'])

    def detect_text(self) -> List["TextDetection"]:
        """Detect text regions in the image.

        Returns:
            One TextDetection per raw EasyOCR hit, each with a normalized
            [x1, y1, x2, y2] bbox and stripped text.
        """
        self.load()
        results = self.reader.readtext(self.config.input_path)
        print(f"EasyOCR found {len(results)} raw detections")

        detections = []
        for box, text, confidence in results:
            bbox = self._normalize_bbox(box)
            detections.append(TextDetection(
                bbox=bbox,
                text=text.strip(),
                confidence=float(confidence)
            ))

        return detections

    def _normalize_bbox(self, box: List[List[int]]) -> List[int]:
        """Convert a 4-corner polygon to an axis-aligned [x1, y1, x2, y2] box."""
        return [
            min(x[0] for x in box),
            min(x[1] for x in box),
            max(x[0] for x in box),
            max(x[1] for x in box)
        ]

    @staticmethod
    def calculate_distance(bbox1: List[int], bbox2: List[int]) -> float:
        """Calculate Euclidean distance between two bounding box centers."""
        center1 = [(bbox1[0] + bbox1[2]) / 2, (bbox1[1] + bbox1[3]) / 2]
        center2 = [(bbox2[0] + bbox2[2]) / 2, (bbox2[1] + bbox2[3]) / 2]
        return np.linalg.norm(np.subtract(center1, center2))

    def group_text_regions(self, detections: List["TextDetection"]) -> List["TextDetection"]:
        """Group nearby text regions into speech bubbles.

        Greedy merge: each detection (top-to-bottom) joins the first
        existing group whose center lies within ``distance_threshold``,
        otherwise it starts a new group. Groups get 1-based ids in
        top-to-bottom order.
        """
        # Filter out detections shorter than the configured minimum.
        filtered_detections = [
            det for det in detections
            if len(det.text.strip()) >= self.config.min_text_length
        ]

        # Sort by vertical position (top to bottom)
        filtered_detections.sort(key=lambda d: d.bbox[1])

        groups = []
        for detection in filtered_detections:
            merged = False

            for group in groups:
                if self.calculate_distance(detection.bbox, group.bbox) < self.config.distance_threshold:
                    self._merge_detections(group, detection)
                    merged = True
                    break

            if not merged:
                groups.append(detection)

        # Sort groups by vertical position and assign IDs
        groups.sort(key=lambda g: g.bbox[1])
        for idx, group in enumerate(groups):
            group.id = idx + 1

        return groups

    def _merge_detections(self, group: "TextDetection", detection: "TextDetection"):
        """Merge `detection` into `group` in place: concatenate text and
        grow the bbox to the union of both boxes."""
        group.text += " " + detection.text
        group.bbox = [
            min(group.bbox[0], detection.bbox[0]),
            min(group.bbox[1], detection.bbox[1]),
            max(group.bbox[2], detection.bbox[2]),
            max(group.bbox[3], detection.bbox[3])
        ]

    def detect_and_group_text(self) -> str:
        """Main method to detect and group text, saving results to JSON.

        Returns:
            Path of the JSON file (``config.text_cood_path``). Acts as a
            cache: OCR is skipped entirely when the file already exists.
        """
        if not os.path.exists(self.config.text_cood_path):
            detections = self.detect_text()
            groups = self.group_text_regions(detections)
            self._save_groups_to_json(groups, self.config.text_cood_path)
            print(f"Grouped bubbles saved: {self.config.text_cood_path}")

        return self.config.text_cood_path

    def _save_groups_to_json(self, groups: List["TextDetection"], output_path: str):
        """Save grouped text detections to JSON file."""
        groups_data = []
        for group in groups:
            groups_data.append({
                "id": group.id,
                # bbox values may be numpy ints; coerce for JSON.
                "bbox": [int(x) for x in group.bbox],
                "text": group.text,
                "confidence": group.confidence
            })

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(groups_data, f, indent=2, ensure_ascii=False)

    def cleanup(self):
        """Release the OCR reader; safe to call repeatedly.

        BUG FIX: the old version did ``del self.reader`` behind a bare
        ``except:``. A second call (e.g. __exit__ followed by __del__) then
        raised AttributeError, which the bare except silently ate — and a
        bare except also swallows KeyboardInterrupt/SystemExit. Resetting
        the attribute to None is idempotent and needs no exception guard;
        getattr covers instances where __init__ never completed.
        """
        if getattr(self, "reader", None) is not None:
            self.reader = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.cleanup()

    def __del__(self):
        self.cleanup()
setup.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from setuptools import setup, find_packages

# Long description is the README as shown on PyPI.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Runtime dependencies live in requirements.txt; skip blanks and comments.
# BUG FIX: the old filter called startswith("#") on the unstripped line, so
# indented comment lines leaked into install_requires.
with open("requirements.txt", "r", encoding="utf-8") as fh:
    requirements = [
        stripped
        for stripped in (line.strip() for line in fh)
        if stripped and not stripped.startswith("#")
    ]

setup(
    name="comic-panel-extractor",
    version="0.1.0",
    author="Jebin Einstein E",
    author_email="jebineinstein@gmail.com",
    description="A tool for extracting panels from comic book images",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/jebin2/comic-panel-extractor",
    packages=find_packages(),
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        # BUG FIX: 3.8/3.9 classifiers contradicted python_requires>=3.10.
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
    ],
    python_requires=">=3.10",
    install_requires=requirements,
    entry_points={
        "console_scripts": [
            # NOTE(review): cli.py sets prog="comic-extract" and its examples
            # use that name, but the installed script is
            # "comic-panel-extractor" — confirm which name is intended.
            "comic-panel-extractor=comic_panel_extractor.cli:main",
        ],
    },
    include_package_data=True,
    zip_safe=False,
)
text_detector.py DELETED
@@ -1,148 +0,0 @@
1
- import json
2
- from typing import List, Tuple, Optional
3
- from dataclasses import dataclass
4
- import os
5
-
6
- import numpy as np
7
- from moviepy.editor import *
8
-
9
@dataclass
class Config:
    """Configuration settings for the comic-to-video pipeline."""
    main_file_name: str = ""        # base name for the run; meaning set by callers — TODO confirm
    comic_image: str = ""           # path to the input comic page image
    temp_folder: str = ""           # scratch directory for intermediate files
    distance_threshold: int = 70    # max center-to-center px distance when grouping text boxes
    vertical_threshold: int = 30    # vertical tolerance in px; consumed elsewhere in pipeline — TODO confirm
    tts_engine: str = "chatterbox"  # text-to-speech backend identifier
    resolution: Tuple[int, int] = (1920, 1080)  # output video (width, height) in px
    margin_ratio: float = 0.08      # fractional margin; applied by video composition — TODO confirm
    auto_scroll: bool = True        # presumably enables scrolling within panels; verify against caller
    zoom_enabled: bool = False      # enable zoom effect during playback
    zoom_factor: float = 1.1        # zoom multiplier when zoom_enabled is True
    output_video: str = "comic_text.mp4"  # output path; also seeds the grouped-text JSON filename
    min_text_length: int = 2        # OCR detections with shorter text are discarded during grouping
25
-
26
-
27
@dataclass
class TextDetection:
    """Represents a detected text region."""
    bbox: List[int]            # axis-aligned box [x1, y1, x2, y2] in image pixels
    text: str                  # recognized text content
    confidence: float          # OCR confidence score for this detection
    id: Optional[int] = None   # 1-based top-to-bottom id, assigned after grouping
34
-
35
class TextDetector:
    """Handles text detection and grouping from comic images.

    OCR is performed with EasyOCR (loaded lazily, since importing it is
    expensive); nearby detections are merged into speech-bubble-level
    groups by bounding-box center distance.
    """

    def __init__(self, config: "Config"):
        self.config = config
        # Created lazily by load(); kept as None until then so cleanup()
        # and __del__ are always safe, even if OCR never ran.
        self.reader = None

    def load(self):
        """Create the EasyOCR reader on first use.

        The import is deferred because easyocr is heavy; guarding on
        self.reader prevents rebuilding the model on every detect call
        (the previous version recreated it unconditionally).
        """
        if self.reader is None:
            import easyocr
            self.reader = easyocr.Reader(['en'])

    def detect_text(self, image_path: str) -> "List[TextDetection]":
        """Detect text regions in the image.

        Args:
            image_path: Path of the image file to OCR.

        Returns:
            One TextDetection per raw EasyOCR hit, each with an
            axis-aligned [x1, y1, x2, y2] bounding box.
        """
        self.load()
        results = self.reader.readtext(image_path)
        print(f"EasyOCR found {len(results)} raw detections")

        detections = []
        for box, text, confidence in results:
            # EasyOCR returns a 4-point polygon; collapse it to an
            # axis-aligned bounding box.
            bbox = [
                min(x[0] for x in box),
                min(x[1] for x in box),
                max(x[0] for x in box),
                max(x[1] for x in box)
            ]
            detections.append(TextDetection(
                bbox=bbox,
                text=text.strip(),
                confidence=float(confidence)
            ))

        return detections

    @staticmethod
    def calculate_distance(bbox1: "List[int]", bbox2: "List[int]") -> float:
        """Calculate Euclidean distance between two bounding box centers."""
        center1 = [(bbox1[0] + bbox1[2]) / 2, (bbox1[1] + bbox1[3]) / 2]
        center2 = [(bbox2[0] + bbox2[2]) / 2, (bbox2[1] + bbox2[3]) / 2]
        return np.linalg.norm(np.subtract(center1, center2))

    def group_text_regions(self, detections: "List[TextDetection]") -> "List[TextDetection]":
        """Group nearby text regions into speech bubbles.

        Detections shorter than config.min_text_length are dropped, then
        the rest are merged greedily whenever their centers are closer
        than config.distance_threshold. A merged group keeps its first
        member's confidence; groups receive 1-based ids in top-to-bottom
        order.
        """
        # Filter out detections that are too short to be real text.
        filtered_detections = [
            det for det in detections
            if len(det.text.strip()) >= self.config.min_text_length
        ]

        # Sort by vertical position so merged text reads top to bottom.
        filtered_detections.sort(key=lambda d: d.bbox[1])

        groups = []
        for detection in filtered_detections:
            added_to_group = False

            for group in groups:
                if self.calculate_distance(detection.bbox, group.bbox) < self.config.distance_threshold:
                    # Merge into this group: concatenate the text and
                    # expand the group's bounding box to cover both.
                    group.text += " " + detection.text
                    group.bbox = [
                        min(group.bbox[0], detection.bbox[0]),
                        min(group.bbox[1], detection.bbox[1]),
                        max(group.bbox[2], detection.bbox[2]),
                        max(group.bbox[3], detection.bbox[3])
                    ]
                    added_to_group = True
                    break

            if not added_to_group:
                groups.append(detection)

        # Sort groups by vertical position and assign reading-order ids.
        groups.sort(key=lambda g: g.bbox[1])
        for idx, group in enumerate(groups):
            group.id = idx + 1

        return groups

    def detect_and_group_text(self, image_path: str) -> str:
        """Detect and group text, caching the result in a JSON file.

        The JSON path is derived from config.output_video; when it
        already exists the expensive OCR pass is skipped entirely.

        Returns:
            Path (str) of the JSON file with the grouped bubbles.
        """
        output_path = self.config.output_video.replace(".mp4", "_detect_and_group_text.json")
        if not os.path.exists(output_path):
            detections = self.detect_text(image_path)
            groups = self.group_text_regions(detections)
            groups_data = []
            for group in groups:
                groups_data.append({
                    "id": group.id,
                    "bbox": [int(x) for x in group.bbox],
                    "text": group.text,
                    "confidence": group.confidence
                })

            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(groups_data, f, indent=2, ensure_ascii=False)

        print(f"Grouped bubbles saved: {output_path}")
        return str(output_path)

    def cleanup(self):
        """Release the OCR reader; safe to call repeatedly or before load().

        Replaces the earlier bare `except: pass` around an unconditional
        `del self.reader` — that pattern hid real errors and relied on
        an attribute that might never have been set.
        """
        if getattr(self, "reader", None) is not None:
            # Dropping the reference frees the (large) model; resetting
            # to None keeps repeated cleanup calls as no-ops.
            self.reader = None

    def __enter__(self):
        """Enter the context manager; yields this detector instance."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Exit the context manager, releasing OCR resources."""
        self.cleanup()

    def __del__(self):
        """Best-effort resource release at garbage collection."""
        self.cleanup()