manga_translation / utils /bubble_detect.py
qqwjq1981's picture
Update utils/bubble_detect.py
f70f4f3 verified
"""
Enhanced speech bubble detection for manga
"""
import cv2
import numpy as np
from shapely.geometry import Polygon
from shapely.ops import unary_union
def detect_speech_bubbles(img_pil, min_area=500, max_area=None, debug=False):
    """
    Basic speech bubble detection using adaptive threshold + morphology.

    Args:
        img_pil: Page image. A PIL image of any mode (it is converted to RGB
            first) or an RGB array-like accepted by ``np.array``.
        min_area: Contours whose area (in pixels) is below this are dropped.
        max_area: Contours whose area is above this are dropped. Defaults to
            one quarter of the page area.
        debug: When true, print the reason each candidate was rejected.

    Returns:
        List of bubble polygons [(x,y), ...]
    """
    # Greyscale / palette pages would make COLOR_RGB2BGR fail, so normalise
    # PIL inputs to 3-channel RGB. Array inputs are passed through unchanged.
    source = img_pil.convert("RGB") if hasattr(img_pil, "convert") else img_pil
    img = cv2.cvtColor(np.array(source), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    h, w = gray.shape
    if max_area is None:
        max_area = (h * w) // 4  # bubbles should not be entire page

    th = cv2.adaptiveThreshold(
        gray,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        35,
        10,
    )
    # Invert so pixels darker than their local threshold become foreground.
    inv = 255 - th

    # Close small gaps first, then open to remove isolated specks.
    kernel_close = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (7, 7))
    cleaned = cv2.morphologyEx(inv, cv2.MORPH_CLOSE, kernel_close, iterations=2)
    kernel_open = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    cleaned = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel_open, iterations=1)

    contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    bubbles = []
    for cnt in contours:
        area = cv2.contourArea(cnt)
        if area < min_area or area > max_area:
            if debug:
                print(f"Skipping contour (area): area={area:.0f}")
            continue

        _, _, bw, bh = cv2.boundingRect(cnt)
        # +1 keeps the ratio finite and slightly favours small boxes.
        aspect_ratio = max(bw, bh) / (min(bw, bh) + 1)
        if aspect_ratio > 5:
            if debug:
                print(f"Skipping contour (elongated): aspect_ratio={aspect_ratio:.2f}")
            continue

        perimeter = cv2.arcLength(cnt, True)
        if perimeter == 0:
            continue

        # Simplify the outline; tolerance is 1% of the contour perimeter.
        approx = cv2.approxPolyDP(cnt, 0.01 * perimeter, True)
        bubbles.append([(int(p[0][0]), int(p[0][1])) for p in approx])

    print(f"🎈 detect_speech_bubbles: {len(bubbles)} candidates")
    return bubbles
def detect_bubbles_heuristic(img_pil, min_area=500, debug=False):
    """
    Detect speech bubbles as bright, compact regions that contain edges.

    Bright, low-saturation pixels are selected in HSV space, cleaned with
    morphology, and each resulting external contour is kept only if it passes
    area, aspect-ratio, solidity and Canny edge-density filters.

    Args:
        img_pil: Page image. A PIL image of any mode (it is converted to RGB
            first) or an RGB array-like accepted by ``np.array``.
        min_area: Contours whose area (in pixels) is below this are dropped.
        debug: When true, print a line for each blob rejected as empty.

    Returns:
        List of bubble polygons, each a list of ``(x, y)`` integer tuples.
    """
    # 1. Convert to OpenCV format. Greyscale / palette pages would make
    #    COLOR_RGB2BGR fail, so PIL inputs are normalised to RGB first.
    source = img_pil.convert("RGB") if hasattr(img_pil, "convert") else img_pil
    img = cv2.cvtColor(np.array(source), cv2.COLOR_RGB2BGR)
    h, w = img.shape[:2]
    max_area = h * w * 0.4  # a single bubble may cover at most 40% of the page

    # 2. HSV masking: keep pixels with saturation <= 40 and value >= 215.
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    lower_white = np.array([0, 0, 215])
    upper_white = np.array([180, 40, 255])
    mask = cv2.inRange(hsv, lower_white, upper_white)

    # Clean up mask: close first, then open with a smaller kernel.
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, np.ones((15, 15), np.uint8))
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, np.ones((5, 5), np.uint8))

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Pre-compute the edge map once; it is sampled per contour below.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 100, 200)

    bubbles = []
    for cnt in contours:
        area = cv2.contourArea(cnt)

        # --- Standard geometric filters ---
        if area < min_area or area > max_area:
            continue

        x, y, bw, bh = cv2.boundingRect(cnt)
        aspect_ratio = float(bw) / bh
        if aspect_ratio < 0.2 or aspect_ratio > 5.0:
            continue

        hull_area = cv2.contourArea(cv2.convexHull(cnt))
        if hull_area == 0:
            continue
        solidity = float(area) / hull_area
        if solidity < 0.7:
            continue

        # --- "Has text?" filter ---
        # Rasterise the filled contour into a mask the size of its bounding
        # box (shifted by -x, -y) instead of a full-page mask per contour.
        # Every contour pixel lies inside the box, so the count is the same.
        roi_mask = np.zeros((bh, bw), dtype=np.uint8)
        cv2.drawContours(roi_mask, [cnt], -1, 255, -1, offset=(-x, -y))
        roi_edges = edges[y:y + bh, x:x + bw]
        edge_pixel_count = cv2.countNonZero(
            cv2.bitwise_and(roi_edges, roi_edges, mask=roi_mask)
        )

        # Density = edge pixels / contour area. Blobs below 1.5% are treated
        # as empty white regions rather than text bubbles.
        density = edge_pixel_count / area
        if density < 0.015:
            if debug:
                print(f"Skipping white blob (Empty): density={density:.4f}")
            continue

        # Simplify shape (tolerance: 0.5% of the perimeter) and add.
        epsilon = 0.005 * cv2.arcLength(cnt, True)
        approx = cv2.approxPolyDP(cnt, epsilon, True)
        bubbles.append([(int(p[0][0]), int(p[0][1])) for p in approx])

    print(f"🎈 Heuristic Bubbles (HSV + TextCheck): {len(bubbles)}")
    return bubbles
def merge_overlapping_bubbles(bubbles, iou_threshold=0.3):
    """
    Merge bubbles that overlap significantly.

    Each not-yet-grouped bubble seeds a group and absorbs every later bubble
    whose IoU with the seed exceeds ``iou_threshold``. Grouping is relative
    to the seed only, not transitive. Each group is replaced by the exterior
    ring(s) of its union.

    Args:
        bubbles: List of polygons, each a sequence of ``(x, y)`` points.
        iou_threshold: Minimum intersection-over-union for two bubbles to be
            merged.

    Returns:
        List of merged polygons as lists of ``(x, y)`` integer tuples. When
        ``bubbles`` has at most one entry it is returned unchanged.
    """
    if len(bubbles) <= 1:
        return bubbles

    shapes = []
    for b in bubbles:
        try:
            p = Polygon(b)
            if not p.is_valid:
                p = p.buffer(0)  # standard repair for self-intersections
        except Exception:
            # Best effort: a point list Shapely cannot turn into a polygon
            # is dropped rather than aborting the whole merge.
            continue
        # buffer(0) on a degenerate ring yields an empty geometry, which
        # would otherwise surface as an empty polygon in the result.
        if p.is_empty:
            continue
        shapes.append(p)

    merged_polys = []
    used = set()
    for i, s1 in enumerate(shapes):
        if i in used:
            continue
        group = [s1]
        used.add(i)
        area1 = s1.area
        for j, s2 in enumerate(shapes[i + 1:], start=i + 1):
            if j in used:
                continue
            inter = s1.intersection(s2).area
            # |A ∪ B| = |A| + |B| - |A ∩ B|; avoids building the union shape.
            union = area1 + s2.area - inter
            iou = inter / union if union > 0 else 0.0
            if iou > iou_threshold:
                group.append(s2)
                used.add(j)

        merged_shape = unary_union(group)
        # A union is either a single Polygon or a multi-part geometry.
        if merged_shape.geom_type == "Polygon":
            parts = [merged_shape]
        else:
            parts = getattr(merged_shape, "geoms", ())
        for part in parts:
            if part.geom_type == "Polygon" and not part.is_empty:
                # Drop the closing vertex, which repeats the first one.
                merged_polys.append(
                    [(int(x), int(y)) for x, y in part.exterior.coords[:-1]]
                )

    print(f"🔄 merge_overlapping_bubbles: {len(bubbles)} → {len(merged_polys)}")
    return merged_polys
def filter_nested_bubbles(bubbles):
    """
    Remove bubbles completely inside other bubbles; keep larger ones.

    Args:
        bubbles: List of polygons, each a sequence of ``(x, y)`` points.

    Returns:
        The surviving original polygons, ordered by decreasing area. When
        ``bubbles`` has at most one entry it is returned unchanged.
    """
    if len(bubbles) <= 1:
        return bubbles

    shapes = []
    for b in bubbles:
        try:
            p = Polygon(b)
            if not p.is_valid:
                p = p.buffer(0)  # standard repair for self-intersections
        except Exception:
            # Best effort: skip point lists Shapely cannot interpret.
            continue
        shapes.append((p, b))

    shapes.sort(key=lambda item: item[0].area, reverse=True)

    filtered = []
    for idx, (shape, poly) in enumerate(shapes):
        # A container can never be smaller than what it contains, so only
        # shapes earlier in the area-sorted order need checking. This also
        # keeps one copy of exact duplicates, which contain each other and
        # would otherwise eliminate one another.
        if any(bigger.contains(shape) for bigger, _ in shapes[:idx]):
            continue
        filtered.append(poly)

    if len(filtered) < len(bubbles):
        print(f"🗑️ filter_nested_bubbles: removed {len(bubbles) - len(filtered)} nested")
    return filtered
def detect_speech_bubbles_robust(img_pil, min_area=500, merge_overlaps=True, filter_nested_flag=True):
    """
    Robust bubble detection with post-processing.
    This is the recommended function to use.

    Runs ``detect_bubbles_heuristic`` and then optionally merges overlapping
    bubbles and removes bubbles nested inside larger ones.

    Args:
        img_pil: Page image, forwarded to ``detect_bubbles_heuristic``.
        min_area: Minimum contour area, forwarded to the detector.
        merge_overlaps: When true, apply ``merge_overlapping_bubbles``.
        filter_nested_flag: When true, apply ``filter_nested_bubbles``.

    Returns:
        List of bubble polygons; an empty list when nothing was detected.
    """
    bubbles = detect_bubbles_heuristic(img_pil, min_area=min_area)
    if not bubbles:
        print("⚠️ detect_speech_bubbles_robust: no initial bubbles")
        return []

    # Merge before filtering so containment is checked on the merged shapes.
    if merge_overlaps:
        bubbles = merge_overlapping_bubbles(bubbles)
    if filter_nested_flag:
        bubbles = filter_nested_bubbles(bubbles)

    print(f"✅ detect_speech_bubbles_robust: final {len(bubbles)} bubbles")
    return bubbles