Spaces:
Running
Running
test workflow
Browse files- ComicPanelExtractor.py +239 -0
- requirements.txt +4 -0
- text_detector.py +148 -0
ComicPanelExtractor.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
from text_detector import TextDetector, Config as CVP_Config
|
| 5 |
+
import cv2
|
| 6 |
+
import shutil
|
| 7 |
+
|
| 8 |
+
# ----------------------------------------------------------
|
| 9 |
+
# MASK TEXT REGIONS
|
| 10 |
+
# ----------------------------------------------------------
|
| 11 |
+
|
| 12 |
+
def mask_text_regions(image_path, bboxes, output_path=None, color=(0, 0, 0)):
    """
    Fill the text regions of an image with a solid color (default black) to
    reduce noise during panel extraction.

    Args:
        image_path (str): Path to the input image.
        bboxes (list of list): Bounding boxes in [x1, y1, x2, y2] format.
            Coordinates may be ints or floats; they are cast to int.
        output_path (str, optional): If given, the masked image is also saved here.
        color (tuple): BGR fill color for the boxes (default black).

    Returns:
        numpy.ndarray: Image with masked text regions.

    Raises:
        ValueError: If the image cannot be loaded.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not load image: {image_path}")

    for x1, y1, x2, y2 in bboxes:
        # thickness=-1 fills the rectangle; int() guards against float
        # coordinates coming from JSON.
        cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)),
                      color, thickness=-1)

    if output_path:
        cv2.imwrite(output_path, image)
        print(f"✅ Text-masked image saved to: {output_path}")

    return image
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# ----------------------------------------------------------
|
| 40 |
+
# PRE PROCESS METHOD
|
| 41 |
+
# ----------------------------------------------------------
|
| 42 |
+
|
| 43 |
+
def pre_process(image_path, output_dir):
    """
    Prepare a page image for panel extraction and save each intermediate stage.

    Pipeline: grayscale -> inverted binary threshold (pixels brighter than 200
    become 0) -> 5x5 rectangular dilation (2 iterations) to strengthen borders
    and fill small gaps. Writes 2_gray.jpg, 3_binary.jpg and 4_dilated.jpg
    into output_dir.

    Args:
        image_path (str): Path to the input (typically text-masked) page image.
        output_dir (str): Directory for intermediate images; created if missing.

    Raises:
        ValueError: If the image cannot be loaded.
    """
    # exist_ok avoids the race between an existence check and creation.
    os.makedirs(output_dir, exist_ok=True)

    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not load image: {image_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)

    # Dilate to strengthen borders and fill small gaps.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    dilated = cv2.dilate(binary, kernel, iterations=2)

    cv2.imwrite(os.path.join(output_dir, "2_gray.jpg"), gray)
    cv2.imwrite(os.path.join(output_dir, "3_binary.jpg"), binary)
    cv2.imwrite(os.path.join(output_dir, "4_dilated.jpg"), dilated)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# ----------------------------------------------------------
|
| 65 |
+
# CLEAN DILATED IMAGE
|
| 66 |
+
# ----------------------------------------------------------
|
| 67 |
+
|
| 68 |
+
def clean_dilated_with_row_priority(dilated_path, output_path, max_neighbors=2):
|
| 69 |
+
"""
|
| 70 |
+
Clean a dilated comic page by thinning thick borders using Game-of-Life logic,
|
| 71 |
+
with preference to clean rows that have fewer black pixels.
|
| 72 |
+
"""
|
| 73 |
+
dilated = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
|
| 74 |
+
if dilated is None:
|
| 75 |
+
raise Exception("Could not load dilated image.")
|
| 76 |
+
|
| 77 |
+
binary = (dilated == 0).astype(np.uint8)
|
| 78 |
+
padded = np.pad(binary, pad_width=1, mode="constant", constant_values=0)
|
| 79 |
+
cleaned = binary.copy()
|
| 80 |
+
|
| 81 |
+
height, width = binary.shape
|
| 82 |
+
row_black_counts = np.sum(binary, axis=1)
|
| 83 |
+
|
| 84 |
+
for y in range(1, height + 1):
|
| 85 |
+
for x in range(1, width + 1):
|
| 86 |
+
if padded[y, x] == 1:
|
| 87 |
+
neighbors = np.sum(padded[y-1:y+2, x-1:x+2]) - 1
|
| 88 |
+
if neighbors > max_neighbors:
|
| 89 |
+
neighbor_rows = [r for r in [y-1, y, y+1] if 1 <= r <= height]
|
| 90 |
+
if neighbor_rows:
|
| 91 |
+
row_to_clear = min(neighbor_rows, key=lambda r: row_black_counts[r-1])
|
| 92 |
+
if y == row_to_clear:
|
| 93 |
+
cleaned[y-1, x-1] = 0
|
| 94 |
+
|
| 95 |
+
cleaned_img = (1 - cleaned) * 255
|
| 96 |
+
cv2.imwrite(output_path, cleaned_img)
|
| 97 |
+
print(f"✅ Cleaned dilated image saved to: {output_path}")
|
| 98 |
+
return output_path
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# ----------------------------------------------------------
|
| 102 |
+
# EXTRACT PANELS - BLACK PERCENTAGE METHOD
|
| 103 |
+
# ----------------------------------------------------------
|
| 104 |
+
|
| 105 |
+
def extract_panels_by_black_percentage_fixed(
|
| 106 |
+
dilated_path, original_image_path, output_dir,
|
| 107 |
+
row_thresh=20, col_thresh=20,
|
| 108 |
+
min_width_ratio=0.1, min_height_ratio=0.1
|
| 109 |
+
):
|
| 110 |
+
"""
|
| 111 |
+
Extract comic panels using black percentage scan with smart width & height filtering.
|
| 112 |
+
"""
|
| 113 |
+
if not os.path.exists(output_dir):
|
| 114 |
+
os.makedirs(output_dir)
|
| 115 |
+
|
| 116 |
+
dilated = cv2.imread(dilated_path, cv2.IMREAD_GRAYSCALE)
|
| 117 |
+
original = cv2.imread(original_image_path)
|
| 118 |
+
if dilated is None or original is None:
|
| 119 |
+
raise Exception("Could not load dilated or original image.")
|
| 120 |
+
|
| 121 |
+
height, width = dilated.shape
|
| 122 |
+
visual_output = original.copy()
|
| 123 |
+
|
| 124 |
+
# Detect row gutters
|
| 125 |
+
row_black_percentage = np.sum(dilated == 0, axis=1) / width * 100
|
| 126 |
+
row_gutters, panel_rows = [], []
|
| 127 |
+
in_gutter = False
|
| 128 |
+
for y, percent_black in enumerate(row_black_percentage):
|
| 129 |
+
if percent_black >= row_thresh and not in_gutter:
|
| 130 |
+
start_row = y
|
| 131 |
+
in_gutter = True
|
| 132 |
+
elif percent_black < row_thresh and in_gutter:
|
| 133 |
+
end_row = y
|
| 134 |
+
row_gutters.append((start_row, end_row))
|
| 135 |
+
in_gutter = False
|
| 136 |
+
|
| 137 |
+
prev_end = 0
|
| 138 |
+
for start, end in row_gutters:
|
| 139 |
+
if start - prev_end > 10:
|
| 140 |
+
panel_rows.append((prev_end, start))
|
| 141 |
+
prev_end = end
|
| 142 |
+
if height - prev_end > 10:
|
| 143 |
+
panel_rows.append((prev_end, height))
|
| 144 |
+
|
| 145 |
+
# Extract panels
|
| 146 |
+
all_panels, panel_count, panel_images, panel_points = [], 0, [], []
|
| 147 |
+
for y1, y2 in panel_rows:
|
| 148 |
+
row_slice = dilated[y1:y2, :]
|
| 149 |
+
col_black_percentage = np.sum(row_slice == 0, axis=0) / (y2 - y1) * 100
|
| 150 |
+
col_gutters, panel_cols = [], []
|
| 151 |
+
in_gutter_col = False
|
| 152 |
+
for x, percent_black in enumerate(col_black_percentage):
|
| 153 |
+
if percent_black >= col_thresh and not in_gutter_col:
|
| 154 |
+
start_col = x
|
| 155 |
+
in_gutter_col = True
|
| 156 |
+
elif percent_black < col_thresh and in_gutter_col:
|
| 157 |
+
end_col = x
|
| 158 |
+
col_gutters.append((start_col, end_col))
|
| 159 |
+
in_gutter_col = False
|
| 160 |
+
|
| 161 |
+
prev_end_col = 0
|
| 162 |
+
for start, end in col_gutters:
|
| 163 |
+
if start - prev_end_col > 10:
|
| 164 |
+
panel_cols.append((prev_end_col, start))
|
| 165 |
+
prev_end_col = end
|
| 166 |
+
if width - prev_end_col > 10:
|
| 167 |
+
panel_cols.append((prev_end_col, width))
|
| 168 |
+
|
| 169 |
+
for x1, x2 in panel_cols:
|
| 170 |
+
w, h = x2 - x1, y2 - y1
|
| 171 |
+
if w * h < (width * height) * 0.005:
|
| 172 |
+
continue
|
| 173 |
+
all_panels.append((x1, y1, x2, y2))
|
| 174 |
+
|
| 175 |
+
# Post-filter
|
| 176 |
+
panel_widths = [x2 - x1 for x1, _, x2, _ in all_panels]
|
| 177 |
+
panel_heights = [y2 - y1 for _, y1, _, y2 in all_panels]
|
| 178 |
+
avg_width = np.mean(panel_widths) if panel_widths else 0
|
| 179 |
+
avg_height = np.mean(panel_heights) if panel_heights else 0
|
| 180 |
+
min_allowed_width = max(avg_width * 0.5, width * min_width_ratio)
|
| 181 |
+
min_allowed_height = max(avg_height * 0.5, height * min_height_ratio)
|
| 182 |
+
|
| 183 |
+
for x1, y1, x2, y2 in all_panels:
|
| 184 |
+
panel_width, panel_height = x2 - x1, y2 - y1
|
| 185 |
+
if panel_width >= min_allowed_width and panel_height >= min_allowed_height:
|
| 186 |
+
panel = original[y1:y2, x1:x2]
|
| 187 |
+
panel_count += 1
|
| 188 |
+
panel_images.append(panel)
|
| 189 |
+
panel_points.append({
|
| 190 |
+
"x_start": x1, "y_start": y1, "x_end": x2, "y_end": y2
|
| 191 |
+
})
|
| 192 |
+
panel_path = os.path.join(output_dir, f"panel_{panel_count}.jpg")
|
| 193 |
+
cv2.imwrite(panel_path, panel)
|
| 194 |
+
cv2.rectangle(visual_output, (x1, y1), (x2, y2), (0, 255, 0), 2)
|
| 195 |
+
cv2.putText(visual_output, f"#{panel_count}", (x1+5, y1+25),
|
| 196 |
+
cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
|
| 197 |
+
|
| 198 |
+
print(f"✅ Extracted {panel_count} panels after smart width & height filtering.")
|
| 199 |
+
return output_dir, panel_images, panel_points
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
# ----------------------------------------------------------
|
| 203 |
+
# MAIN EXECUTION
|
| 204 |
+
# ----------------------------------------------------------
|
| 205 |
+
if __name__ == "__main__":
    image_path = "input.jpg"
    output_dir = "extracted_panels"

    # Start from a clean output directory on every run.
    shutil.rmtree(output_dir, ignore_errors=True)
    os.makedirs(output_dir, exist_ok=True)

    # Point the text-detection pipeline at the page and the output folder.
    cvp_config = CVP_Config()
    cvp_config.main_file_name = image_path
    cvp_config.temp_folder = output_dir
    cvp_config.comic_image = image_path
    cvp_config.output_video = f"{output_dir}/test.mp4"

    # Locate speech bubbles so they can be painted out before border analysis.
    with TextDetector(cvp_config) as text_detector:
        bubbles_path = text_detector.detect_and_group_text(cvp_config.comic_image)
        with open(bubbles_path, "r", encoding="utf-8") as f:
            bubbles = json.load(f)

    text_removed_path = os.path.join(output_dir, "1_text_removed.jpg")
    mask_text_regions(image_path, [box["bbox"] for box in bubbles],
                      output_path=text_removed_path)

    # Grayscale/threshold/dilate the masked page, then thin the thick borders.
    pre_process(text_removed_path, output_dir)
    dilated_path = os.path.join(output_dir, "4_dilated.jpg")
    cleaned_dilated_path = os.path.join(output_dir, "5_dilated_cleaned.jpg")
    clean_dilated_with_row_priority(dilated_path, cleaned_dilated_path, max_neighbors=2)

    # Finally, carve the original page into panels along the detected gutters.
    extract_panels_by_black_percentage_fixed(
        cleaned_dilated_path,
        image_path,
        output_dir,
        min_width_ratio=0.1,  # Panels must be at least 10% of total width
    )
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
moviepy==1.0.3
|
| 2 |
+
numpy
|
| 3 |
+
opencv-python
|
| 4 |
+
easyocr
|
text_detector.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from typing import List, Tuple, Optional
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
from moviepy.editor import *
|
| 8 |
+
|
| 9 |
+
@dataclass
class Config:
    """Configuration settings for the comic-to-video pipeline."""
    # Base name of the comic file being processed (set by the calling script).
    main_file_name: str = ""
    # Path to the comic page image fed into text detection.
    comic_image: str = ""
    # Working directory for intermediate artifacts (set by the calling script).
    temp_folder: str = ""
    # Max center-to-center distance (px) for merging OCR hits into one bubble
    # (used by TextDetector.group_text_regions).
    distance_threshold: int = 70
    # NOTE(review): not referenced in this file — presumably a vertical
    # grouping limit used elsewhere in the pipeline; confirm.
    vertical_threshold: int = 30
    # NOTE(review): text-to-speech engine id; not used in this file.
    tts_engine: str = "chatterbox"
    # Target video resolution as (width, height); not used in this file.
    resolution: Tuple[int, int] = (1920, 1080)
    # Fraction of the frame kept as margin; not used in this file.
    margin_ratio: float = 0.08
    # NOTE(review): scroll/zoom flags for video rendering; not used here.
    auto_scroll: bool = True
    zoom_enabled: bool = False
    zoom_factor: float = 1.1
    # Output video path; also used to derive the grouped-bubbles JSON filename
    # in TextDetector.detect_and_group_text.
    output_video: str = "comic_text.mp4"
    # OCR detections shorter than this many characters are discarded
    # before grouping.
    min_text_length: int = 2
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@dataclass
class TextDetection:
    """Represents a detected text region."""
    # Axis-aligned box as [x1, y1, x2, y2] (top-left / bottom-right corners).
    bbox: List[int]
    # Recognized text content (whitespace-stripped on creation).
    text: str
    # OCR confidence score for this detection.
    confidence: float
    # 1-based reading-order id, assigned after grouping (None until then).
    id: Optional[int] = None
|
| 34 |
+
|
| 35 |
+
class TextDetector:
    """Handles text detection and grouping from comic images."""

    def __init__(self, config: Config):
        self.config = config
        # Lazily-created EasyOCR reader. Building it is expensive (loads the
        # recognition model), so it is created once on first use and reused.
        self.reader = None

    def load(self):
        """Create the EasyOCR reader if it does not exist yet (idempotent)."""
        if self.reader is None:
            import easyocr
            self.reader = easyocr.Reader(['en'])

    def detect_text(self, image_path: str) -> List[TextDetection]:
        """Detect text regions in the image.

        Returns one TextDetection per raw OCR hit, with EasyOCR's 4-point
        quadrilateral collapsed to an axis-aligned [x1, y1, x2, y2] bbox.
        """
        self.load()
        results = self.reader.readtext(image_path)
        print(f"EasyOCR found {len(results)} raw detections")

        detections = []
        for box, text, confidence in results:
            # Take the bounding rectangle of the 4 corner points.
            xs = [pt[0] for pt in box]
            ys = [pt[1] for pt in box]
            detections.append(TextDetection(
                bbox=[min(xs), min(ys), max(xs), max(ys)],
                text=text.strip(),
                confidence=float(confidence),
            ))

        return detections

    @staticmethod
    def calculate_distance(bbox1: List[int], bbox2: List[int]) -> float:
        """Calculate Euclidean distance between two bounding box centers."""
        center1 = [(bbox1[0] + bbox1[2]) / 2, (bbox1[1] + bbox1[3]) / 2]
        center2 = [(bbox2[0] + bbox2[2]) / 2, (bbox2[1] + bbox2[3]) / 2]
        return np.linalg.norm(np.subtract(center1, center2))

    def group_text_regions(self, detections: List[TextDetection]) -> List[TextDetection]:
        """Group nearby text regions into speech bubbles.

        Detections shorter than config.min_text_length are dropped; each
        remaining detection merges into the first existing group whose
        (growing) bbox center lies within config.distance_threshold, so
        results are order-dependent by vertical position. A merged group
        keeps the confidence of its seed detection.
        """
        # Filter out detections too short to be meaningful text.
        filtered_detections = [
            det for det in detections
            if len(det.text.strip()) >= self.config.min_text_length
        ]

        # Process top-to-bottom so grouping follows reading order.
        filtered_detections.sort(key=lambda d: d.bbox[1])

        groups = []
        for detection in filtered_detections:
            added_to_group = False

            for group in groups:
                if self.calculate_distance(detection.bbox, group.bbox) < self.config.distance_threshold:
                    # Merge: append text and expand the group's bounding box.
                    group.text += " " + detection.text
                    group.bbox = [
                        min(group.bbox[0], detection.bbox[0]),
                        min(group.bbox[1], detection.bbox[1]),
                        max(group.bbox[2], detection.bbox[2]),
                        max(group.bbox[3], detection.bbox[3])
                    ]
                    added_to_group = True
                    break

            if not added_to_group:
                groups.append(detection)

        # Sort groups by vertical position and assign 1-based reading ids.
        groups.sort(key=lambda g: g.bbox[1])
        for idx, group in enumerate(groups):
            group.id = idx + 1

        return groups

    def detect_and_group_text(self, image_path: str) -> str:
        """Detect text, group it into bubbles, and save the result as JSON.

        The JSON path is derived from config.output_video. If that file
        already exists it is reused as a cache and OCR is skipped entirely.

        Returns:
            str: Path of the grouped-bubbles JSON file.
        """
        output_path = self.config.output_video.replace(".mp4", "_detect_and_group_text.json")
        if not os.path.exists(output_path):
            detections = self.detect_text(image_path)
            groups = self.group_text_regions(detections)
            groups_data = []
            for group in groups:
                groups_data.append({
                    "id": group.id,
                    "bbox": [int(x) for x in group.bbox],
                    "text": group.text,
                    "confidence": group.confidence
                })

            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(groups_data, f, indent=2, ensure_ascii=False)

            print(f"Grouped bubbles saved: {output_path}")
        return str(output_path)

    def cleanup(self):
        """Drop the OCR reader so its model memory can be reclaimed.

        Safe to call multiple times and before load(); load() will simply
        recreate the reader on the next detect_text call.
        """
        self.reader = None

    def __enter__(self):
        """Context-manager entry; the reader is still created lazily."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Release resources when leaving the context."""
        self.cleanup()

    def __del__(self):
        """Best-effort cleanup at garbage collection."""
        self.cleanup()
|