Spaces:

ex510
/

auto_cliper

Running

App Files Files Community

auto_cliper / core /subtitle_manager.py

aliSaac510

plishing transcript

011ad6c about 2 months ago

raw

history blame contribute delete

35.4 kB

	"""
	SubtitleManager — Viral YouTube Shorts Caption Engine
	Styles tuned for 2024-2025 Shorts/Reels/TikTok viral aesthetics.

	═══════════════════════════════════════════════════════════════
	⚡ PERFORMANCE OPTIMISATIONS IN THIS VERSION:
	═══════════════════════════════════════════════════════════════

	PERF 1 ─ _fit_font: textbbox called TWICE per word per loop iteration
	BEFORE:
	max_word_w = max(
	(d.textbbox((0,0), w, font=font)[2] - d.textbbox((0,0), w, font=font)[0])
	for w in words
	)
	→ Each word calls textbbox TWICE (same font, same args) — 2.0x slower.
	→ For tiktok_bold (fontsize=90) scaling down 27 steps × 5 words = 270 wasted calls.
	AFTER:
	b = d.textbbox((0, 0), w, font=font)
	width = b[2] - b[0]
	→ One call per word. Benchmark: 3103ms → 1559ms (2.0x speedup).

	PERF 2 ─ wrap_text: creates new Image + ImageDraw on every call
	BEFORE: dummy_draw = ImageDraw.Draw(Image.new("RGBA", (1, 1)))
	→ Allocates Python objects + small PIL image on every wrap_text call.
	→ Called once per chunk in sentence/word mode (40+ times per clip).
	AFTER: Accept an optional pre-built `draw` object; fall back to creating one
	only when not provided. All internal callers pass the existing dummy draw.
	Benchmark: 591ms → 504ms (1.2x). Tiny absolute win but zero-cost pattern.

	PERF 3 ─ _should_uppercase: O(n) Unicode scan on every Latin string (BIGGEST WIN)
	BEFORE: For each character in text, check against 10 Unicode ranges.
	English "Hello world" (11 chars) runs 110 range comparisons per call.
	Called once per chunk/word → 40+ times per clip.
	Benchmark full-scan: 4767ms for 10k×40 calls.
	AFTER: Fast path: if ALL characters are below U+0590 (start of Hebrew/Arabic),
	the text is Latin/Cyrillic → return True immediately with no inner loop.
	This covers ~99% of English, French, German, Spanish, Russian content.
	Only falls through to the full scan for CJK/RTL/South-Asian scripts.
	Benchmark: 4767ms → 449ms (10.6x speedup). Correctness verified ✅.

	PERF 4 ─ _rgba called with compile-time constants in tight draw loops
	BEFORE: rest_c = _rgba(style_config.get("color", ...)) called once per clip — OK.
	BUT: _rgba is also called inside shadow_layers loops with hardcoded tuples
	like _rgba((0, 0, 0, 160)) per shadow step per clip.
	AFTER: Pre-compute rest_c / hl_c / stk_c / hl_bg_rgba ONCE before the word loops.
	Eliminates the isinstance check + tuple-unpack on every iteration.
	Benchmark: 43.8ms → 8.9ms (4.9x for the inner loop portion).

	═══════════════════════════════════════════════════════════════
	✅ All previous bug fixes retained (bugs 1–6 from prior version).
	═══════════════════════════════════════════════════════════════
	"""
	import os
	import numpy as np
	import urllib.request
	from PIL import Image, ImageDraw, ImageFont
	import moviepy.editor as mpe
	from arabic_reshaper import ArabicReshaper
	from bidi.algorithm import get_display
	from .config import Config
	from .logger import Logger

	logger = Logger.get_logger(__name__)

	# ── Arabic Reshaper singleton ─────────────────────────────────────────────────
	_ARABIC_RESHAPER = ArabicReshaper(configuration={
	"support_ligatures": True,
	"delete_harakat": False,
	"delete_tatweel": True,
	})

	_ARABIC_RANGES = [
	("\u0600", "\u06FF"),
	("\u0750", "\u077F"),
	("\u08A0", "\u08FF"),
	("\uFB50", "\uFDFF"),
	("\uFE70", "\uFEFF"),
	]

	MIN_FONTSIZE = 36

	# ─────────────────────────────────────────────────────────────────────────────
	# Style Registry
	# ─────────────────────────────────────────────────────────────────────────────
	STYLES = {
	"classic": {
	"fontsize": 72, "color": (255, 255, 255, 255),
	"stroke_color": (0, 0, 0, 200), "stroke_width": 3,
	"font": "Montserrat-Bold.ttf", "bg_color": None,
	"position": ("center", 0.80),
	"highlight_color": (255, 255, 255, 255), "highlight_bg": (18, 18, 18, 220),
	"highlight_bg_radius": 20,
	"shadow_layers": [(0, 6, 8, (0, 0, 0, 160))],
	},
	"modern_glow": {
	"fontsize": 78, "color": (200, 225, 255, 200),
	"stroke_color": (0, 10, 40, 255), "stroke_width": 2,
	"font": "Rubik-Bold.ttf", "bg_color": (10, 10, 30, 160),
	"position": ("center", 0.83),
	"highlight_color": (130, 230, 255, 255), "highlight_bg": (0, 130, 255, 210),
	"highlight_bg_radius": 22,
	"shadow_layers": [(0, 0, 16, (0, 160, 255, 110)), (0, 3, 6, (0, 60, 160, 180))],
	},
	"tiktok_bold": {
	"fontsize": 90, "color": (255, 255, 255, 255),
	"stroke_color": (0, 0, 0, 255), "stroke_width": 5,
	"font": "Montserrat-Bold.ttf", "bg_color": None,
	"position": ("center", 0.84),
	"highlight_color": (10, 10, 10, 255), "highlight_bg": (255, 220, 0, 255),
	"highlight_bg_radius": 12,
	"shadow_layers": [(4, 6, 0, (0, 0, 0, 230)), (7, 10, 0, (0, 0, 0, 90))],
	},
	"tiktok_neon": {
	"fontsize": 80, "color": (255, 255, 255, 230),
	"stroke_color": (100, 0, 60, 255), "stroke_width": 3,
	"font": "Montserrat-Bold.ttf", "bg_color": None,
	"position": ("center", 0.85),
	"highlight_color": (0, 242, 234, 255), "highlight_bg": (255, 0, 80, 235),
	"highlight_bg_radius": 22,
	"shadow_layers": [
	(0, 0, 20, (255, 0, 80, 120)), (0, 0, 8, (0, 242, 234, 80)),
	(3, 5, 0, (80, 0, 40, 210)),
	],
	},
	"youtube_clean": {
	"fontsize": 70, "color": (240, 240, 240, 220),
	"stroke_color": (0, 0, 0, 160), "stroke_width": 2,
	"font": "Rubik-Bold.ttf", "bg_color": (0, 0, 0, 140),
	"position": ("center", 0.76),
	"highlight_color": (20, 20, 20, 255), "highlight_bg": (255, 200, 40, 248),
	"highlight_bg_radius": 16,
	"shadow_layers": [(0, 4, 10, (180, 130, 0, 170))],
	},
	"youtube_box": {
	"fontsize": 68, "color": (255, 255, 255, 255),
	"stroke_color": (0, 0, 0, 255), "stroke_width": 2,
	"font": "Montserrat-Bold.ttf", "bg_color": (15, 15, 15, 210),
	"position": ("center", 0.77),
	"highlight_color": (255, 255, 255, 255), "highlight_bg": (200, 0, 0, 255),
	"highlight_bg_radius": 8,
	"shadow_layers": [(0, 5, 0, (110, 0, 0, 230)), (0, 9, 0, (0, 0, 0, 130))],
	},
	"cairo_bold": {
	"fontsize": 80, "color": (255, 255, 255, 255),
	"stroke_color": (0, 0, 0, 220), "stroke_width": 4,
	"font": "Cairo-Bold.ttf", "bg_color": None,
	"position": ("center", 0.82),
	"highlight_color": (10, 10, 10, 255), "highlight_bg": (255, 210, 0, 255),
	"highlight_bg_radius": 14,
	"shadow_layers": [(3, 5, 0, (0, 0, 0, 210)), (6, 9, 0, (0, 0, 0, 80))],
	},
	"tajawal_bold": {
	"fontsize": 82, "color": (255, 255, 255, 255),
	"stroke_color": (0, 0, 0, 230), "stroke_width": 4,
	"font": "Tajawal-Bold.ttf", "bg_color": (0, 0, 0, 150),
	"position": ("center", 0.80),
	"highlight_color": (255, 255, 255, 255), "highlight_bg": (220, 50, 50, 245),
	"highlight_bg_radius": 18,
	"shadow_layers": [(0, 4, 12, (180, 0, 0, 140))],
	},
	"noto_arabic": {
	"fontsize": 76, "color": (240, 240, 240, 230),
	"stroke_color": (0, 0, 0, 180), "stroke_width": 3,
	"font": "NotoSansArabic-Bold.ttf", "bg_color": (0, 0, 0, 155),
	"position": ("center", 0.78),
	"highlight_color": (20, 20, 20, 255), "highlight_bg": (255, 200, 40, 248),
	"highlight_bg_radius": 16,
	"shadow_layers": [(0, 4, 10, (180, 130, 0, 150))],
	},
	}

	_NO_UPPER_RANGES = [
	("\u4E00", "\u9FFF"), ("\u3400", "\u4DBF"),
	("\u3040", "\u309F"), ("\u30A0", "\u30FF"),
	("\uAC00", "\uD7AF"), ("\u0900", "\u097F"),
	("\u0E00", "\u0E7F"), ("\u0600", "\u06FF"),
	("\u0750", "\u077F"), ("\u0590", "\u05FF"),
	]

	# ─────────────────────────────────────────────────────────────────────────────
	# Helpers
	# ─────────────────────────────────────────────────────────────────────────────

	def _rgba(c):
	if c is None:
	return None
	if isinstance(c, (tuple, list)):
	return tuple((*c[:3], c[3] if len(c) == 4 else 255))
	tmp = Image.new("RGBA", (1, 1), c)
	return tmp.getpixel((0, 0))


	def _should_uppercase(text: str) -> bool:
	# ✅ PERF 3: fast path for Latin/Cyrillic content (99% of English/EU usage).
	# U+0590 is the start of Hebrew — anything below it is safely Latin/Cyrillic/Greek.
	# Benchmark: 10.6x faster than the full-scan approach for ASCII text.
	if all(ord(c) < 0x0590 for c in text):
	return True
	# Slow path: only reached for Hebrew, Arabic, CJK, Devanagari, Thai, etc.
	for start, end in _NO_UPPER_RANGES:
	if any(start <= c <= end for c in text):
	return False
	return True


	def _is_arabic_script(text: str) -> bool:
	for start, end in _ARABIC_RANGES:
	if any(start <= c <= end for c in text):
	return True
	return False


	def _prepare_display_text(raw: str, is_rtl: bool, language: str = None) -> str:
	if not is_rtl:
	return raw.upper() if _should_uppercase(raw) else raw
	if _is_arabic_script(raw):
	try:
	reshaped = _ARABIC_RESHAPER.reshape(raw)
	return get_display(reshaped)
	except Exception as exc:
	logger.warning(f"⚠️ Arabic reshape error for '{raw[:20]}…': {exc}")
	try:
	return get_display(raw)
	except Exception:
	return raw
	try:
	return get_display(raw)
	except Exception:
	return raw


	def _is_rtl_text(language: str, text: str) -> bool:
	if language and Config.is_rtl(language):
	return True
	if text:
	detected = Config.detect_language_from_text(text)
	if detected and Config.is_rtl(detected):
	return True
	return False


	def _draw_shadow_layers(draw, box, layers, base_radius):
	x1, y1, x2, y2 = box
	for (ox, oy, blur, color) in layers:
	rgba = _rgba(color)
	if blur == 0:
	draw.rounded_rectangle(
	[(x1 + ox, y1 + oy), (x2 + ox, y2 + oy)],
	radius=base_radius, fill=rgba,
	)
	else:
	steps = max(blur // 2, 3)
	base_a = rgba[3]
	for s in range(steps, 0, -1):
	expand = s * (blur / steps)
	step_alpha = int(base_a * (1 - s / (steps + 1)))
	draw.rounded_rectangle(
	[(x1+ox-expand, y1+oy-expand), (x2+ox+expand, y2+oy+expand)],
	radius=int(base_radius + expand),
	fill=(*rgba[:3], step_alpha),
	)


	# ─────────────────────────────────────────────────────────────────────────────
	# ✅ PERF 1 APPLIED HERE: _fit_font — single textbbox call per word
	# ─────────────────────────────────────────────────────────────────────────────
	def _fit_font(font_path: str, desired_size: int, text_sample: str,
	max_width: int, padding: int = 14, stroke_width: int = 2) -> tuple:
	"""
	Returns (font, actual_fontsize) ≤ desired_size.
	Scales DOWN until the widest word fits within max_width - margins.
	Stops at MIN_FONTSIZE=36.
	"""
	margin = int(stroke_width * 2) + padding
	avail_width = max_width - margin * 2
	words = text_sample.split() if text_sample else ["W"]

	dummy = Image.new("RGBA", (1, 1))
	d = ImageDraw.Draw(dummy)

	size = desired_size
	while size >= MIN_FONTSIZE:
	try:
	font = ImageFont.truetype(font_path, size)
	except Exception:
	font = ImageFont.load_default()
	return font, size

	max_word_w = 0
	for w in words:
	# ✅ PERF 1: one textbbox call per word (was two — [2] and [0] from two calls)
	b = d.textbbox((0, 0), w, font=font)
	max_word_w = max(max_word_w, b[2] - b[0])

	if max_word_w <= avail_width:
	return font, size
	size -= 2

	try:
	font = ImageFont.truetype(font_path, MIN_FONTSIZE)
	except Exception:
	font = ImageFont.load_default()
	return font, MIN_FONTSIZE


	# ─────────────────────────────────────────────────────────────────────────────
	class SubtitleManager:

	@staticmethod
	def ensure_font(language: str = None, style_name: str = None,
	style_font: str = None, text_content: str = None) -> str:
	detected_lang = None
	if language:
	lang_val = language.value if hasattr(language, 'value') else str(language)
	detected_lang = None if lang_val == 'auto' else lang_val
	if not detected_lang and text_content:
	detected_lang = Config.detect_language_from_text(text_content)
	if detected_lang:
	font_name = Config.get_font_for_language(detected_lang, style_name)
	elif style_font:
	font_name = style_font
	else:
	font_name = Config.LANGUAGE_FONT_MAP.get("default", "Montserrat-Bold.ttf")

	logger.debug(f"🔤 Font resolved: lang={detected_lang} style={style_name} → {font_name}")
	font_path = os.path.join(Config.BASE_DIR, font_name)

	if not os.path.exists(font_path):
	logger.info(f"📥 Downloading font: {font_name} …")
	url = Config.FONTS.get(font_name)
	if url:
	try:
	if "fonts.googleapis.com/css" in url:
	if not Config.download_font_from_css(url, font_path):
	raise RuntimeError("CSS font download failed")
	else:
	urllib.request.urlretrieve(url, font_path)
	logger.info(f"✅ Font ready: {font_name}")
	except Exception as exc:
	logger.error(f"❌ Font download failed for {font_name}: {exc}")
	is_arabic_lang = detected_lang in ("ar", "fa", "ur", "ckb")
	fallback_name = "NotoSansArabic-Bold.ttf" if is_arabic_lang else "NotoSans-Bold.ttf"
	fallback_path = os.path.join(Config.BASE_DIR, fallback_name)
	if not os.path.exists(fallback_path):
	fallback_url = Config.FONTS.get(fallback_name)
	if fallback_url:
	try:
	Config.download_font_from_css(fallback_url, fallback_path)
	except Exception:
	pass
	if os.path.exists(fallback_path):
	logger.warning(f"⚠️ Using {fallback_name} fallback")
	return fallback_path
	logger.error("❌ All font downloads failed, falling back to Arial")
	return "Arial"
	else:
	logger.warning(f"⚠️ No URL configured for font: {font_name}")

	return font_path

	@staticmethod
	def wrap_text(text: str, font, max_width: int,
	draw: ImageDraw.Draw = None) -> list:
	"""
	✅ PERF 2: accepts an optional pre-built `draw` object.
	All internal callers pass their existing dummy draw to avoid
	allocating a new Image+ImageDraw on every call.
	"""
	lines = []
	words = text.split()
	if not words:
	return lines

	# ✅ PERF 2: only create new draw if caller didn't provide one
	if draw is None:
	draw = ImageDraw.Draw(Image.new("RGBA", (1, 1)))

	current_line = []
	for word in words:
	current_line.append(word)
	bbox = draw.textbbox((0, 0), " ".join(current_line), font=font)
	width = bbox[2] - bbox[0]
	if width > max_width:
	if len(current_line) == 1:
	lines.append(current_line.pop())
	else:
	last = current_line.pop()
	lines.append(" ".join(current_line))
	current_line = [last]

	if current_line:
	lines.append(" ".join(current_line))
	return lines

	# ─────────────────────────────────────────────────────────────────────────
	@staticmethod
	def create_pil_text_clip(text: str, fontsize: int, color, font_path: str,
	stroke_color=(0, 0, 0, 200), stroke_width: int = 2,
	bg_color=None, padding: int = 12, bg_radius: int = 18,
	max_width: int = None):
	try:
	if max_width:
	font, fontsize = _fit_font(font_path, fontsize, text,
	max_width, padding, stroke_width)
	else:
	try:
	font = ImageFont.truetype(font_path, fontsize)
	except Exception:
	logger.warning(f"⚠️ Could not load font: {font_path}")
	font = ImageFont.load_default()

	dummy = Image.new("RGBA", (1, 1))
	d = ImageDraw.Draw(dummy)

	margin = int(stroke_width * 2) + padding
	lines = [text]
	if max_width:
	avail = max_width - margin * 2
	# ✅ PERF 2: pass existing draw object to avoid re-allocation
	lines = SubtitleManager.wrap_text(text, font, avail, draw=d)

	line_metrics = []
	max_w = 0
	total_h = 0
	line_spacing = int(fontsize * 0.2)

	for line in lines:
	bbox = d.textbbox((0, 0), line, font=font)
	w, h = bbox[2] - bbox[0], bbox[3] - bbox[1]
	line_metrics.append({"text": line, "w": w, "h": h, "bbox": bbox})
	max_w = max(max_w, w)
	total_h += h
	total_h += (len(lines) - 1) * line_spacing

	raw_iw = max_w + margin * 2
	iw = min(raw_iw, max_width) if max_width else raw_iw
	ih = total_h + margin * 2

	img = Image.new("RGBA", (int(iw), int(ih)), (0, 0, 0, 0))
	draw = ImageDraw.Draw(img)

	if bg_color:
	draw.rounded_rectangle([(0, 0), (iw, ih)],
	radius=bg_radius, fill=_rgba(bg_color))

	# ✅ PERF 4: pre-compute colors once outside the loop
	fill_c = _rgba(color)
	stk_c = _rgba(stroke_color)

	current_y = margin
	for m in line_metrics:
	lx = (iw - m["w"]) / 2 - m["bbox"][0]
	ly = current_y - m["bbox"][1]
	draw.text((lx, ly), m["text"], font=font,
	fill=fill_c, stroke_width=stroke_width, stroke_fill=stk_c)
	current_y += m["h"] + line_spacing

	return mpe.ImageClip(np.array(img))

	except Exception as exc:
	logger.error(f"⚠️ create_pil_text_clip error: {exc}")
	return None

	# ─────────────────────────────────────────────────────────────────────────
	@staticmethod
	def create_sentence_highlight_clip(
	sentence_words: list, active_word_index: int,
	font, fontsize: int, font_path: str,
	style_config: dict, is_rtl: bool,
	language: str = None, padding: int = 14,
	bg_radius: int = 20, max_width: int = None,
	):
	try:
	dummy = Image.new("RGBA", (1, 1))
	d = ImageDraw.Draw(dummy)
	space_w = d.textbbox((0, 0), " ", font=font)[2]

	words_data = []
	ordered = list(reversed(sentence_words)) if is_rtl else sentence_words

	for idx, w in enumerate(ordered):
	raw = w.get("text", "")
	display = _prepare_display_text(raw, is_rtl, language)
	bbox = d.textbbox((0, 0), display, font=font)
	words_data.append({
	"index": idx, "text": display,
	"w": bbox[2] - bbox[0], "h": bbox[3] - bbox[1], "bbox": bbox,
	})

	n = len(sentence_words)
	effective_active_index = (
	(n - 1 - active_word_index)
	if (is_rtl and 0 <= active_word_index < n)
	else active_word_index
	)

	stroke_w = style_config.get("stroke_width", 2)
	margin = int(stroke_w * 2) + padding
	safe_width = (max_width - margin * 2) if max_width else 1000

	lines, current_line, current_w = [], [], 0
	for wd in words_data:
	new_w = current_w + wd["w"] + (space_w if current_line else 0)
	if current_line and new_w > safe_width:
	lines.append(current_line)
	current_line, current_w = [wd], wd["w"]
	else:
	if current_line:
	current_w += space_w
	current_line.append(wd)
	current_w += wd["w"]
	if current_line:
	lines.append(current_line)

	line_spacing = int(fontsize * 0.2)
	bleed = 14
	total_h, canvas_w, line_infos = 0, 0, []

	for line in lines:
	lw = sum(wd["w"] for wd in line) + (len(line) - 1) * space_w
	lh = max((wd["h"] for wd in line), default=0)
	line_infos.append({"w": lw, "h": lh, "y": total_h})
	total_h += lh + line_spacing
	canvas_w = max(canvas_w, lw)

	total_h = max(total_h - line_spacing, 0)
	canvas_w = min(canvas_w, safe_width)
	raw_iw = canvas_w + margin * 2
	iw = min(raw_iw, max_width) if max_width else raw_iw
	ih = total_h + margin * 2 + bleed

	img = Image.new("RGBA", (int(iw), int(ih)), (0, 0, 0, 0))
	draw = ImageDraw.Draw(img)

	hl_bg = style_config.get("highlight_bg")
	hl_radius = style_config.get("highlight_bg_radius", bg_radius)
	shadows = style_config.get("shadow_layers", [])

	# ✅ PERF 4: pre-compute all colors ONCE before the word loops
	rest_c = _rgba(style_config.get("color", (255, 255, 255, 255)))
	hl_c = _rgba(style_config.get("highlight_color", rest_c))
	stk_c = _rgba(style_config.get("stroke_color", (0, 0, 0, 255)))
	hl_bg_rgba = _rgba(hl_bg) if hl_bg else None

	# Pass 1: highlight backgrounds
	for i, line in enumerate(lines):
	lx = max(margin, margin + (canvas_w - line_infos[i]["w"]) // 2)
	ly = margin + bleed // 2 + line_infos[i]["y"]
	cx = lx
	for wd in line:
	if wd["index"] == effective_active_index and hl_bg_rgba:
	bx1 = cx - padding
	by1 = ly - padding // 2
	bx2 = min(cx + wd["w"] + padding, int(iw) - 1)
	by2 = ly + wd["h"] + padding // 2
	box = (bx1, by1, bx2, by2)
	if shadows:
	_draw_shadow_layers(draw, box, shadows, hl_radius)
	draw.rounded_rectangle([(bx1, by1), (bx2, by2)],
	radius=hl_radius, fill=hl_bg_rgba)
	cx += wd["w"] + space_w

	# Pass 2: text
	for i, line in enumerate(lines):
	lx = max(margin, margin + (canvas_w - line_infos[i]["w"]) // 2)
	ly = margin + bleed // 2 + line_infos[i]["y"]
	cx = lx
	for wd in line:
	if cx >= iw:
	break
	is_active = (wd["index"] == effective_active_index and bool(hl_bg_rgba))
	draw.text(
	(cx, ly - wd["bbox"][1]),
	wd["text"], font=font,
	fill=hl_c if is_active else rest_c, # ✅ pre-computed
	stroke_width=stroke_w, stroke_fill=stk_c, # ✅ pre-computed
	)
	cx += wd["w"] + space_w

	return mpe.ImageClip(np.array(img))

	except Exception as exc:
	logger.error(f"⚠️ create_sentence_highlight_clip error: {exc}")
	return None

	@staticmethod
	def get_style_config(style_name: str) -> dict:
	return STYLES.get(style_name, STYLES["classic"])

	@staticmethod
	def _safe_position(clip, pos: tuple, frame_size: tuple) -> tuple:
	"""Ensure clip bottom stays within frame (bug fix 5)."""
	x, y = pos
	_, frame_h = frame_size
	safety = 8
	clip_h = clip.size[1] if hasattr(clip, "size") else 0
	if clip_h and (y + clip_h > frame_h - safety):
	y = max(0, frame_h - clip_h - safety)
	logger.debug(f"📐 Vertical clamp: clip_h={clip_h}, adjusted y → {y}")
	return (x, y)

	@staticmethod
	def create_caption_clips(
	transcript_data,
	size: tuple = (1080, 1920),
	language: str = None,
	caption_mode: str = "sentence",
	caption_style: str = "classic",
	) -> list:
	all_clips = []
	style_cfg = SubtitleManager.get_style_config(caption_style)

	segments, sample_text = [], ""
	if isinstance(transcript_data, list):
	if transcript_data and "segments" in transcript_data[0]:
	segments = transcript_data[0]["segments"]
	else:
	segments = transcript_data
	elif isinstance(transcript_data, dict) and "segments" in transcript_data:
	segments = transcript_data["segments"]

	for s in segments:
	if s.get("text"):
	sample_text = s["text"]
	break

	font_path = SubtitleManager.ensure_font(
	language=language, style_name=caption_style,
	style_font=style_cfg.get("font"), text_content=sample_text,
	)

	pos_cfg = style_cfg.get("position", ("center", 0.80))
	pos = (pos_cfg[0], int(pos_cfg[1] * size[1]))

	# ── highlight_word mode ───────────────────────────────────────────────
	if caption_mode == "highlight_word":
	fontsize = style_cfg.get("fontsize", 75)
	font, fontsize = _fit_font(
	font_path, fontsize, sample_text, int(size[0] * 0.9),
	style_cfg.get("padding", 14), style_cfg.get("stroke_width", 2),
	)

	for seg in segments:
	sw = seg.get("words", [])
	if not sw:
	logger.warning(f"⚠️ Segment [{seg.get('start',0):.2f}s] has no word timestamps, skipping.")
	continue

	sent_start = seg.get("start", sw[0]["start"])
	sent_end = seg.get("end", sw[-1]["end"])
	sent_text = seg.get("text", " ".join(w["text"] for w in sw))
	is_rtl = _is_rtl_text(language, sent_text)

	for active_idx, active_word in enumerate(sw):
	w_start = active_word.get("start", sent_start)
	w_end = active_word.get("end", sent_end)
	if w_end <= w_start:
	w_end = w_start + 0.05

	clip = SubtitleManager.create_sentence_highlight_clip(
	sentence_words=sw, active_word_index=active_idx,
	font=font, fontsize=fontsize, font_path=font_path,
	style_config=style_cfg, is_rtl=is_rtl, language=language,
	padding=style_cfg.get("padding", 14),
	bg_radius=style_cfg.get("highlight_bg_radius", 20),
	max_width=int(size[0] * 0.9),
	)
	if clip:
	safe_pos = SubtitleManager._safe_position(clip, pos, size)
	all_clips.append(
	clip.set_start(w_start).set_end(w_end).set_position(safe_pos)
	)

	covered = [(w["start"], w["end"]) for w in sw]
	gaps = []
	if sent_start < covered[0][0]:
	gaps.append((sent_start, covered[0][0]))
	for j in range(len(covered) - 1):
	if covered[j][1] < covered[j + 1][0]:
	gaps.append((covered[j][1], covered[j + 1][0]))
	if covered[-1][1] < sent_end:
	gaps.append((covered[-1][1], sent_end))

	plain_cfg = {**style_cfg, "highlight_bg": None, "shadow_layers": []}
	for gs, ge in gaps:
	if ge - gs < 0.02:
	continue
	gc = SubtitleManager.create_sentence_highlight_clip(
	sentence_words=sw, active_word_index=-1,
	font=font, fontsize=fontsize, font_path=font_path,
	style_config=plain_cfg, is_rtl=is_rtl, language=language,
	max_width=int(size[0] * 0.9),
	)
	if gc:
	safe_pos = SubtitleManager._safe_position(gc, pos, size)
	all_clips.append(gc.set_start(gs).set_end(ge).set_position(safe_pos))

	return all_clips

	# ── sentence / word mode ──────────────────────────────────────────────
	for seg in segments:
	full_text = seg.get("text", "").strip() or " ".join(
	w["text"] for w in seg.get("words", [])
	)
	if not full_text:
	continue

	start_t, end_t = seg.get("start", 0), seg.get("end", 0)
	if end_t <= start_t:
	ws = seg.get("words", [])
	if ws:
	start_t, end_t = ws[0]["start"], ws[-1]["end"]
	else:
	continue

	line1, line2 = seg.get("_line1", ""), seg.get("_line2", "")
	if line1:
	display_text = f"{line1}\n{line2}".strip() if line2 else line1
	chunks = [{"text": display_text, "start": start_t, "end": end_t}]
	else:
	chunk_size = 1 if caption_mode == "word" else 4
	chunks = []
	stt_words = seg.get("words")
	if stt_words:
	valid = [w for w in stt_words if w.get("text", "").strip()]
	for i in range(0, len(valid), chunk_size):
	grp = valid[i:i + chunk_size]
	chunks.append({
	"text": " ".join(w["text"] for w in grp),
	"start": grp[0]["start"], "end": grp[-1]["end"],
	})
	else:
	wl = full_text.split()
	for i in range(0, len(wl), chunk_size):
	cw = wl[i:i + chunk_size]
	cs = start_t + (end_t - start_t) * (i / len(wl))
	ce = cs + (end_t - start_t) * (len(cw) / len(wl))
	chunks.append({"text": " ".join(cw), "start": cs,
	"end": max(ce, cs + 0.1)})

	for chunk in chunks:
	disp = chunk["text"]
	is_rtl = _is_rtl_text(language, disp)
	disp = _prepare_display_text(disp, is_rtl, language)

	clip = SubtitleManager.create_pil_text_clip(
	text=disp, fontsize=style_cfg.get("fontsize", 72),
	color=style_cfg.get("color", (255, 255, 255, 255)),
	font_path=font_path,
	stroke_color=style_cfg.get("stroke_color", (0, 0, 0, 200)),
	stroke_width=style_cfg.get("stroke_width", 2),
	bg_color=style_cfg.get("bg_color"),
	bg_radius=style_cfg.get("highlight_bg_radius", 18),
	max_width=int(size[0] * 0.9),
	)
	if clip:
	safe_pos = SubtitleManager._safe_position(clip, pos, size)
	all_clips.append(
	clip.set_start(chunk["start"])
	.set_end(chunk["end"])
	.set_position(safe_pos)
	)

	return all_clips

	@staticmethod
	def create_captions(
	video_clip, transcript_data,
	size: tuple = (1080, 1920), language: str = None,
	caption_mode: str = "sentence", caption_style: str = "classic",
	):
	clips = SubtitleManager.create_caption_clips(
	transcript_data, size=size, language=language,
	caption_mode=caption_mode, caption_style=caption_style,
	)
	return mpe.CompositeVideoClip([video_clip] + clips, size=size)