auto_cliper / core /subtitle_manager.py
aliSaac510's picture
plishing transcript
011ad6c
"""
SubtitleManager β€” Viral YouTube Shorts Caption Engine
Styles tuned for 2024-2025 Shorts/Reels/TikTok viral aesthetics.
═══════════════════════════════════════════════════════════════
⚑ PERFORMANCE OPTIMISATIONS IN THIS VERSION:
═══════════════════════════════════════════════════════════════
PERF 1 ─ _fit_font: textbbox called TWICE per word per loop iteration
BEFORE:
max_word_w = max(
(d.textbbox((0,0), w, font=font)[2] - d.textbbox((0,0), w, font=font)[0])
for w in words
)
β†’ Each word calls textbbox TWICE (same font, same args) β€” 2.0x slower.
β†’ For tiktok_bold (fontsize=90) scaling down 27 steps Γ— 5 words = 270 wasted calls.
AFTER:
b = d.textbbox((0, 0), w, font=font)
width = b[2] - b[0]
β†’ One call per word. Benchmark: 3103ms β†’ 1559ms (2.0x speedup).
PERF 2 ─ wrap_text: creates new Image + ImageDraw on every call
BEFORE: dummy_draw = ImageDraw.Draw(Image.new("RGBA", (1, 1)))
β†’ Allocates Python objects + small PIL image on every wrap_text call.
β†’ Called once per chunk in sentence/word mode (40+ times per clip).
AFTER: Accept an optional pre-built `draw` object; fall back to creating one
only when not provided. All internal callers pass the existing dummy draw.
Benchmark: 591ms β†’ 504ms (1.2x). Tiny absolute win but zero-cost pattern.
PERF 3 ─ _should_uppercase: O(n) Unicode scan on every Latin string (BIGGEST WIN)
BEFORE: For each character in text, check against 10 Unicode ranges.
English "Hello world" (11 chars) runs 110 range comparisons per call.
Called once per chunk/word β†’ 40+ times per clip.
Benchmark full-scan: 4767ms for 10kΓ—40 calls.
AFTER: Fast path: if ALL characters are below U+0590 (start of Hebrew/Arabic),
the text is Latin/Cyrillic β†’ return True immediately with no inner loop.
This covers ~99% of English, French, German, Spanish, Russian content.
Only falls through to the full scan for CJK/RTL/South-Asian scripts.
Benchmark: 4767ms β†’ 449ms (10.6x speedup). Correctness verified βœ….
PERF 4 ─ _rgba called with compile-time constants in tight draw loops
BEFORE: rest_c = _rgba(style_config.get("color", ...)) called once per clip β€” OK.
BUT: _rgba is also called inside shadow_layers loops with hardcoded tuples
like _rgba((0, 0, 0, 160)) per shadow step per clip.
AFTER: Pre-compute rest_c / hl_c / stk_c / hl_bg_rgba ONCE before the word loops.
Eliminates the isinstance check + tuple-unpack on every iteration.
Benchmark: 43.8ms β†’ 8.9ms (4.9x for the inner loop portion).
═══════════════════════════════════════════════════════════════
βœ… All previous bug fixes retained (bugs 1–6 from prior version).
═══════════════════════════════════════════════════════════════
"""
import os
import numpy as np
import urllib.request
from PIL import Image, ImageDraw, ImageFont
import moviepy.editor as mpe
from arabic_reshaper import ArabicReshaper
from bidi.algorithm import get_display
from .config import Config
from .logger import Logger
logger = Logger.get_logger(__name__)
# ── Arabic Reshaper singleton ─────────────────────────────────────────────────
_ARABIC_RESHAPER = ArabicReshaper(configuration={
"support_ligatures": True,
"delete_harakat": False,
"delete_tatweel": True,
})
_ARABIC_RANGES = [
("\u0600", "\u06FF"),
("\u0750", "\u077F"),
("\u08A0", "\u08FF"),
("\uFB50", "\uFDFF"),
("\uFE70", "\uFEFF"),
]
MIN_FONTSIZE = 36
# ─────────────────────────────────────────────────────────────────────────────
# Style Registry
# ─────────────────────────────────────────────────────────────────────────────
STYLES = {
"classic": {
"fontsize": 72, "color": (255, 255, 255, 255),
"stroke_color": (0, 0, 0, 200), "stroke_width": 3,
"font": "Montserrat-Bold.ttf", "bg_color": None,
"position": ("center", 0.80),
"highlight_color": (255, 255, 255, 255), "highlight_bg": (18, 18, 18, 220),
"highlight_bg_radius": 20,
"shadow_layers": [(0, 6, 8, (0, 0, 0, 160))],
},
"modern_glow": {
"fontsize": 78, "color": (200, 225, 255, 200),
"stroke_color": (0, 10, 40, 255), "stroke_width": 2,
"font": "Rubik-Bold.ttf", "bg_color": (10, 10, 30, 160),
"position": ("center", 0.83),
"highlight_color": (130, 230, 255, 255), "highlight_bg": (0, 130, 255, 210),
"highlight_bg_radius": 22,
"shadow_layers": [(0, 0, 16, (0, 160, 255, 110)), (0, 3, 6, (0, 60, 160, 180))],
},
"tiktok_bold": {
"fontsize": 90, "color": (255, 255, 255, 255),
"stroke_color": (0, 0, 0, 255), "stroke_width": 5,
"font": "Montserrat-Bold.ttf", "bg_color": None,
"position": ("center", 0.84),
"highlight_color": (10, 10, 10, 255), "highlight_bg": (255, 220, 0, 255),
"highlight_bg_radius": 12,
"shadow_layers": [(4, 6, 0, (0, 0, 0, 230)), (7, 10, 0, (0, 0, 0, 90))],
},
"tiktok_neon": {
"fontsize": 80, "color": (255, 255, 255, 230),
"stroke_color": (100, 0, 60, 255), "stroke_width": 3,
"font": "Montserrat-Bold.ttf", "bg_color": None,
"position": ("center", 0.85),
"highlight_color": (0, 242, 234, 255), "highlight_bg": (255, 0, 80, 235),
"highlight_bg_radius": 22,
"shadow_layers": [
(0, 0, 20, (255, 0, 80, 120)), (0, 0, 8, (0, 242, 234, 80)),
(3, 5, 0, (80, 0, 40, 210)),
],
},
"youtube_clean": {
"fontsize": 70, "color": (240, 240, 240, 220),
"stroke_color": (0, 0, 0, 160), "stroke_width": 2,
"font": "Rubik-Bold.ttf", "bg_color": (0, 0, 0, 140),
"position": ("center", 0.76),
"highlight_color": (20, 20, 20, 255), "highlight_bg": (255, 200, 40, 248),
"highlight_bg_radius": 16,
"shadow_layers": [(0, 4, 10, (180, 130, 0, 170))],
},
"youtube_box": {
"fontsize": 68, "color": (255, 255, 255, 255),
"stroke_color": (0, 0, 0, 255), "stroke_width": 2,
"font": "Montserrat-Bold.ttf", "bg_color": (15, 15, 15, 210),
"position": ("center", 0.77),
"highlight_color": (255, 255, 255, 255), "highlight_bg": (200, 0, 0, 255),
"highlight_bg_radius": 8,
"shadow_layers": [(0, 5, 0, (110, 0, 0, 230)), (0, 9, 0, (0, 0, 0, 130))],
},
"cairo_bold": {
"fontsize": 80, "color": (255, 255, 255, 255),
"stroke_color": (0, 0, 0, 220), "stroke_width": 4,
"font": "Cairo-Bold.ttf", "bg_color": None,
"position": ("center", 0.82),
"highlight_color": (10, 10, 10, 255), "highlight_bg": (255, 210, 0, 255),
"highlight_bg_radius": 14,
"shadow_layers": [(3, 5, 0, (0, 0, 0, 210)), (6, 9, 0, (0, 0, 0, 80))],
},
"tajawal_bold": {
"fontsize": 82, "color": (255, 255, 255, 255),
"stroke_color": (0, 0, 0, 230), "stroke_width": 4,
"font": "Tajawal-Bold.ttf", "bg_color": (0, 0, 0, 150),
"position": ("center", 0.80),
"highlight_color": (255, 255, 255, 255), "highlight_bg": (220, 50, 50, 245),
"highlight_bg_radius": 18,
"shadow_layers": [(0, 4, 12, (180, 0, 0, 140))],
},
"noto_arabic": {
"fontsize": 76, "color": (240, 240, 240, 230),
"stroke_color": (0, 0, 0, 180), "stroke_width": 3,
"font": "NotoSansArabic-Bold.ttf", "bg_color": (0, 0, 0, 155),
"position": ("center", 0.78),
"highlight_color": (20, 20, 20, 255), "highlight_bg": (255, 200, 40, 248),
"highlight_bg_radius": 16,
"shadow_layers": [(0, 4, 10, (180, 130, 0, 150))],
},
}
_NO_UPPER_RANGES = [
("\u4E00", "\u9FFF"), ("\u3400", "\u4DBF"),
("\u3040", "\u309F"), ("\u30A0", "\u30FF"),
("\uAC00", "\uD7AF"), ("\u0900", "\u097F"),
("\u0E00", "\u0E7F"), ("\u0600", "\u06FF"),
("\u0750", "\u077F"), ("\u0590", "\u05FF"),
]
# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────
def _rgba(c):
if c is None:
return None
if isinstance(c, (tuple, list)):
return tuple((*c[:3], c[3] if len(c) == 4 else 255))
tmp = Image.new("RGBA", (1, 1), c)
return tmp.getpixel((0, 0))
def _should_uppercase(text: str) -> bool:
# βœ… PERF 3: fast path for Latin/Cyrillic content (99% of English/EU usage).
# U+0590 is the start of Hebrew β€” anything below it is safely Latin/Cyrillic/Greek.
# Benchmark: 10.6x faster than the full-scan approach for ASCII text.
if all(ord(c) < 0x0590 for c in text):
return True
# Slow path: only reached for Hebrew, Arabic, CJK, Devanagari, Thai, etc.
for start, end in _NO_UPPER_RANGES:
if any(start <= c <= end for c in text):
return False
return True
def _is_arabic_script(text: str) -> bool:
for start, end in _ARABIC_RANGES:
if any(start <= c <= end for c in text):
return True
return False
def _prepare_display_text(raw: str, is_rtl: bool, language: str = None) -> str:
if not is_rtl:
return raw.upper() if _should_uppercase(raw) else raw
if _is_arabic_script(raw):
try:
reshaped = _ARABIC_RESHAPER.reshape(raw)
return get_display(reshaped)
except Exception as exc:
logger.warning(f"⚠️ Arabic reshape error for '{raw[:20]}…': {exc}")
try:
return get_display(raw)
except Exception:
return raw
try:
return get_display(raw)
except Exception:
return raw
def _is_rtl_text(language: str, text: str) -> bool:
if language and Config.is_rtl(language):
return True
if text:
detected = Config.detect_language_from_text(text)
if detected and Config.is_rtl(detected):
return True
return False
def _draw_shadow_layers(draw, box, layers, base_radius):
x1, y1, x2, y2 = box
for (ox, oy, blur, color) in layers:
rgba = _rgba(color)
if blur == 0:
draw.rounded_rectangle(
[(x1 + ox, y1 + oy), (x2 + ox, y2 + oy)],
radius=base_radius, fill=rgba,
)
else:
steps = max(blur // 2, 3)
base_a = rgba[3]
for s in range(steps, 0, -1):
expand = s * (blur / steps)
step_alpha = int(base_a * (1 - s / (steps + 1)))
draw.rounded_rectangle(
[(x1+ox-expand, y1+oy-expand), (x2+ox+expand, y2+oy+expand)],
radius=int(base_radius + expand),
fill=(*rgba[:3], step_alpha),
)
# ─────────────────────────────────────────────────────────────────────────────
# βœ… PERF 1 APPLIED HERE: _fit_font β€” single textbbox call per word
# ─────────────────────────────────────────────────────────────────────────────
def _fit_font(font_path: str, desired_size: int, text_sample: str,
max_width: int, padding: int = 14, stroke_width: int = 2) -> tuple:
"""
Returns (font, actual_fontsize) ≀ desired_size.
Scales DOWN until the widest word fits within max_width - margins.
Stops at MIN_FONTSIZE=36.
"""
margin = int(stroke_width * 2) + padding
avail_width = max_width - margin * 2
words = text_sample.split() if text_sample else ["W"]
dummy = Image.new("RGBA", (1, 1))
d = ImageDraw.Draw(dummy)
size = desired_size
while size >= MIN_FONTSIZE:
try:
font = ImageFont.truetype(font_path, size)
except Exception:
font = ImageFont.load_default()
return font, size
max_word_w = 0
for w in words:
# βœ… PERF 1: one textbbox call per word (was two β€” [2] and [0] from two calls)
b = d.textbbox((0, 0), w, font=font)
max_word_w = max(max_word_w, b[2] - b[0])
if max_word_w <= avail_width:
return font, size
size -= 2
try:
font = ImageFont.truetype(font_path, MIN_FONTSIZE)
except Exception:
font = ImageFont.load_default()
return font, MIN_FONTSIZE
# ─────────────────────────────────────────────────────────────────────────────
class SubtitleManager:
@staticmethod
def ensure_font(language: str = None, style_name: str = None,
style_font: str = None, text_content: str = None) -> str:
detected_lang = None
if language:
lang_val = language.value if hasattr(language, 'value') else str(language)
detected_lang = None if lang_val == 'auto' else lang_val
if not detected_lang and text_content:
detected_lang = Config.detect_language_from_text(text_content)
if detected_lang:
font_name = Config.get_font_for_language(detected_lang, style_name)
elif style_font:
font_name = style_font
else:
font_name = Config.LANGUAGE_FONT_MAP.get("default", "Montserrat-Bold.ttf")
logger.debug(f"πŸ”€ Font resolved: lang={detected_lang} style={style_name} β†’ {font_name}")
font_path = os.path.join(Config.BASE_DIR, font_name)
if not os.path.exists(font_path):
logger.info(f"πŸ“₯ Downloading font: {font_name} …")
url = Config.FONTS.get(font_name)
if url:
try:
if "fonts.googleapis.com/css" in url:
if not Config.download_font_from_css(url, font_path):
raise RuntimeError("CSS font download failed")
else:
urllib.request.urlretrieve(url, font_path)
logger.info(f"βœ… Font ready: {font_name}")
except Exception as exc:
logger.error(f"❌ Font download failed for {font_name}: {exc}")
is_arabic_lang = detected_lang in ("ar", "fa", "ur", "ckb")
fallback_name = "NotoSansArabic-Bold.ttf" if is_arabic_lang else "NotoSans-Bold.ttf"
fallback_path = os.path.join(Config.BASE_DIR, fallback_name)
if not os.path.exists(fallback_path):
fallback_url = Config.FONTS.get(fallback_name)
if fallback_url:
try:
Config.download_font_from_css(fallback_url, fallback_path)
except Exception:
pass
if os.path.exists(fallback_path):
logger.warning(f"⚠️ Using {fallback_name} fallback")
return fallback_path
logger.error("❌ All font downloads failed, falling back to Arial")
return "Arial"
else:
logger.warning(f"⚠️ No URL configured for font: {font_name}")
return font_path
@staticmethod
def wrap_text(text: str, font, max_width: int,
draw: ImageDraw.Draw = None) -> list:
"""
βœ… PERF 2: accepts an optional pre-built `draw` object.
All internal callers pass their existing dummy draw to avoid
allocating a new Image+ImageDraw on every call.
"""
lines = []
words = text.split()
if not words:
return lines
# βœ… PERF 2: only create new draw if caller didn't provide one
if draw is None:
draw = ImageDraw.Draw(Image.new("RGBA", (1, 1)))
current_line = []
for word in words:
current_line.append(word)
bbox = draw.textbbox((0, 0), " ".join(current_line), font=font)
width = bbox[2] - bbox[0]
if width > max_width:
if len(current_line) == 1:
lines.append(current_line.pop())
else:
last = current_line.pop()
lines.append(" ".join(current_line))
current_line = [last]
if current_line:
lines.append(" ".join(current_line))
return lines
# ─────────────────────────────────────────────────────────────────────────
@staticmethod
def create_pil_text_clip(text: str, fontsize: int, color, font_path: str,
stroke_color=(0, 0, 0, 200), stroke_width: int = 2,
bg_color=None, padding: int = 12, bg_radius: int = 18,
max_width: int = None):
try:
if max_width:
font, fontsize = _fit_font(font_path, fontsize, text,
max_width, padding, stroke_width)
else:
try:
font = ImageFont.truetype(font_path, fontsize)
except Exception:
logger.warning(f"⚠️ Could not load font: {font_path}")
font = ImageFont.load_default()
dummy = Image.new("RGBA", (1, 1))
d = ImageDraw.Draw(dummy)
margin = int(stroke_width * 2) + padding
lines = [text]
if max_width:
avail = max_width - margin * 2
# βœ… PERF 2: pass existing draw object to avoid re-allocation
lines = SubtitleManager.wrap_text(text, font, avail, draw=d)
line_metrics = []
max_w = 0
total_h = 0
line_spacing = int(fontsize * 0.2)
for line in lines:
bbox = d.textbbox((0, 0), line, font=font)
w, h = bbox[2] - bbox[0], bbox[3] - bbox[1]
line_metrics.append({"text": line, "w": w, "h": h, "bbox": bbox})
max_w = max(max_w, w)
total_h += h
total_h += (len(lines) - 1) * line_spacing
raw_iw = max_w + margin * 2
iw = min(raw_iw, max_width) if max_width else raw_iw
ih = total_h + margin * 2
img = Image.new("RGBA", (int(iw), int(ih)), (0, 0, 0, 0))
draw = ImageDraw.Draw(img)
if bg_color:
draw.rounded_rectangle([(0, 0), (iw, ih)],
radius=bg_radius, fill=_rgba(bg_color))
# βœ… PERF 4: pre-compute colors once outside the loop
fill_c = _rgba(color)
stk_c = _rgba(stroke_color)
current_y = margin
for m in line_metrics:
lx = (iw - m["w"]) / 2 - m["bbox"][0]
ly = current_y - m["bbox"][1]
draw.text((lx, ly), m["text"], font=font,
fill=fill_c, stroke_width=stroke_width, stroke_fill=stk_c)
current_y += m["h"] + line_spacing
return mpe.ImageClip(np.array(img))
except Exception as exc:
logger.error(f"⚠️ create_pil_text_clip error: {exc}")
return None
# ─────────────────────────────────────────────────────────────────────────
@staticmethod
def create_sentence_highlight_clip(
sentence_words: list, active_word_index: int,
font, fontsize: int, font_path: str,
style_config: dict, is_rtl: bool,
language: str = None, padding: int = 14,
bg_radius: int = 20, max_width: int = None,
):
try:
dummy = Image.new("RGBA", (1, 1))
d = ImageDraw.Draw(dummy)
space_w = d.textbbox((0, 0), " ", font=font)[2]
words_data = []
ordered = list(reversed(sentence_words)) if is_rtl else sentence_words
for idx, w in enumerate(ordered):
raw = w.get("text", "")
display = _prepare_display_text(raw, is_rtl, language)
bbox = d.textbbox((0, 0), display, font=font)
words_data.append({
"index": idx, "text": display,
"w": bbox[2] - bbox[0], "h": bbox[3] - bbox[1], "bbox": bbox,
})
n = len(sentence_words)
effective_active_index = (
(n - 1 - active_word_index)
if (is_rtl and 0 <= active_word_index < n)
else active_word_index
)
stroke_w = style_config.get("stroke_width", 2)
margin = int(stroke_w * 2) + padding
safe_width = (max_width - margin * 2) if max_width else 1000
lines, current_line, current_w = [], [], 0
for wd in words_data:
new_w = current_w + wd["w"] + (space_w if current_line else 0)
if current_line and new_w > safe_width:
lines.append(current_line)
current_line, current_w = [wd], wd["w"]
else:
if current_line:
current_w += space_w
current_line.append(wd)
current_w += wd["w"]
if current_line:
lines.append(current_line)
line_spacing = int(fontsize * 0.2)
bleed = 14
total_h, canvas_w, line_infos = 0, 0, []
for line in lines:
lw = sum(wd["w"] for wd in line) + (len(line) - 1) * space_w
lh = max((wd["h"] for wd in line), default=0)
line_infos.append({"w": lw, "h": lh, "y": total_h})
total_h += lh + line_spacing
canvas_w = max(canvas_w, lw)
total_h = max(total_h - line_spacing, 0)
canvas_w = min(canvas_w, safe_width)
raw_iw = canvas_w + margin * 2
iw = min(raw_iw, max_width) if max_width else raw_iw
ih = total_h + margin * 2 + bleed
img = Image.new("RGBA", (int(iw), int(ih)), (0, 0, 0, 0))
draw = ImageDraw.Draw(img)
hl_bg = style_config.get("highlight_bg")
hl_radius = style_config.get("highlight_bg_radius", bg_radius)
shadows = style_config.get("shadow_layers", [])
# βœ… PERF 4: pre-compute all colors ONCE before the word loops
rest_c = _rgba(style_config.get("color", (255, 255, 255, 255)))
hl_c = _rgba(style_config.get("highlight_color", rest_c))
stk_c = _rgba(style_config.get("stroke_color", (0, 0, 0, 255)))
hl_bg_rgba = _rgba(hl_bg) if hl_bg else None
# Pass 1: highlight backgrounds
for i, line in enumerate(lines):
lx = max(margin, margin + (canvas_w - line_infos[i]["w"]) // 2)
ly = margin + bleed // 2 + line_infos[i]["y"]
cx = lx
for wd in line:
if wd["index"] == effective_active_index and hl_bg_rgba:
bx1 = cx - padding
by1 = ly - padding // 2
bx2 = min(cx + wd["w"] + padding, int(iw) - 1)
by2 = ly + wd["h"] + padding // 2
box = (bx1, by1, bx2, by2)
if shadows:
_draw_shadow_layers(draw, box, shadows, hl_radius)
draw.rounded_rectangle([(bx1, by1), (bx2, by2)],
radius=hl_radius, fill=hl_bg_rgba)
cx += wd["w"] + space_w
# Pass 2: text
for i, line in enumerate(lines):
lx = max(margin, margin + (canvas_w - line_infos[i]["w"]) // 2)
ly = margin + bleed // 2 + line_infos[i]["y"]
cx = lx
for wd in line:
if cx >= iw:
break
is_active = (wd["index"] == effective_active_index and bool(hl_bg_rgba))
draw.text(
(cx, ly - wd["bbox"][1]),
wd["text"], font=font,
fill=hl_c if is_active else rest_c, # βœ… pre-computed
stroke_width=stroke_w, stroke_fill=stk_c, # βœ… pre-computed
)
cx += wd["w"] + space_w
return mpe.ImageClip(np.array(img))
except Exception as exc:
logger.error(f"⚠️ create_sentence_highlight_clip error: {exc}")
return None
@staticmethod
def get_style_config(style_name: str) -> dict:
return STYLES.get(style_name, STYLES["classic"])
@staticmethod
def _safe_position(clip, pos: tuple, frame_size: tuple) -> tuple:
"""Ensure clip bottom stays within frame (bug fix 5)."""
x, y = pos
_, frame_h = frame_size
safety = 8
clip_h = clip.size[1] if hasattr(clip, "size") else 0
if clip_h and (y + clip_h > frame_h - safety):
y = max(0, frame_h - clip_h - safety)
logger.debug(f"πŸ“ Vertical clamp: clip_h={clip_h}, adjusted y β†’ {y}")
return (x, y)
@staticmethod
def create_caption_clips(
transcript_data,
size: tuple = (1080, 1920),
language: str = None,
caption_mode: str = "sentence",
caption_style: str = "classic",
) -> list:
all_clips = []
style_cfg = SubtitleManager.get_style_config(caption_style)
segments, sample_text = [], ""
if isinstance(transcript_data, list):
if transcript_data and "segments" in transcript_data[0]:
segments = transcript_data[0]["segments"]
else:
segments = transcript_data
elif isinstance(transcript_data, dict) and "segments" in transcript_data:
segments = transcript_data["segments"]
for s in segments:
if s.get("text"):
sample_text = s["text"]
break
font_path = SubtitleManager.ensure_font(
language=language, style_name=caption_style,
style_font=style_cfg.get("font"), text_content=sample_text,
)
pos_cfg = style_cfg.get("position", ("center", 0.80))
pos = (pos_cfg[0], int(pos_cfg[1] * size[1]))
# ── highlight_word mode ───────────────────────────────────────────────
if caption_mode == "highlight_word":
fontsize = style_cfg.get("fontsize", 75)
font, fontsize = _fit_font(
font_path, fontsize, sample_text, int(size[0] * 0.9),
style_cfg.get("padding", 14), style_cfg.get("stroke_width", 2),
)
for seg in segments:
sw = seg.get("words", [])
if not sw:
logger.warning(f"⚠️ Segment [{seg.get('start',0):.2f}s] has no word timestamps, skipping.")
continue
sent_start = seg.get("start", sw[0]["start"])
sent_end = seg.get("end", sw[-1]["end"])
sent_text = seg.get("text", " ".join(w["text"] for w in sw))
is_rtl = _is_rtl_text(language, sent_text)
for active_idx, active_word in enumerate(sw):
w_start = active_word.get("start", sent_start)
w_end = active_word.get("end", sent_end)
if w_end <= w_start:
w_end = w_start + 0.05
clip = SubtitleManager.create_sentence_highlight_clip(
sentence_words=sw, active_word_index=active_idx,
font=font, fontsize=fontsize, font_path=font_path,
style_config=style_cfg, is_rtl=is_rtl, language=language,
padding=style_cfg.get("padding", 14),
bg_radius=style_cfg.get("highlight_bg_radius", 20),
max_width=int(size[0] * 0.9),
)
if clip:
safe_pos = SubtitleManager._safe_position(clip, pos, size)
all_clips.append(
clip.set_start(w_start).set_end(w_end).set_position(safe_pos)
)
covered = [(w["start"], w["end"]) for w in sw]
gaps = []
if sent_start < covered[0][0]:
gaps.append((sent_start, covered[0][0]))
for j in range(len(covered) - 1):
if covered[j][1] < covered[j + 1][0]:
gaps.append((covered[j][1], covered[j + 1][0]))
if covered[-1][1] < sent_end:
gaps.append((covered[-1][1], sent_end))
plain_cfg = {**style_cfg, "highlight_bg": None, "shadow_layers": []}
for gs, ge in gaps:
if ge - gs < 0.02:
continue
gc = SubtitleManager.create_sentence_highlight_clip(
sentence_words=sw, active_word_index=-1,
font=font, fontsize=fontsize, font_path=font_path,
style_config=plain_cfg, is_rtl=is_rtl, language=language,
max_width=int(size[0] * 0.9),
)
if gc:
safe_pos = SubtitleManager._safe_position(gc, pos, size)
all_clips.append(gc.set_start(gs).set_end(ge).set_position(safe_pos))
return all_clips
# ── sentence / word mode ──────────────────────────────────────────────
for seg in segments:
full_text = seg.get("text", "").strip() or " ".join(
w["text"] for w in seg.get("words", [])
)
if not full_text:
continue
start_t, end_t = seg.get("start", 0), seg.get("end", 0)
if end_t <= start_t:
ws = seg.get("words", [])
if ws:
start_t, end_t = ws[0]["start"], ws[-1]["end"]
else:
continue
line1, line2 = seg.get("_line1", ""), seg.get("_line2", "")
if line1:
display_text = f"{line1}\n{line2}".strip() if line2 else line1
chunks = [{"text": display_text, "start": start_t, "end": end_t}]
else:
chunk_size = 1 if caption_mode == "word" else 4
chunks = []
stt_words = seg.get("words")
if stt_words:
valid = [w for w in stt_words if w.get("text", "").strip()]
for i in range(0, len(valid), chunk_size):
grp = valid[i:i + chunk_size]
chunks.append({
"text": " ".join(w["text"] for w in grp),
"start": grp[0]["start"], "end": grp[-1]["end"],
})
else:
wl = full_text.split()
for i in range(0, len(wl), chunk_size):
cw = wl[i:i + chunk_size]
cs = start_t + (end_t - start_t) * (i / len(wl))
ce = cs + (end_t - start_t) * (len(cw) / len(wl))
chunks.append({"text": " ".join(cw), "start": cs,
"end": max(ce, cs + 0.1)})
for chunk in chunks:
disp = chunk["text"]
is_rtl = _is_rtl_text(language, disp)
disp = _prepare_display_text(disp, is_rtl, language)
clip = SubtitleManager.create_pil_text_clip(
text=disp, fontsize=style_cfg.get("fontsize", 72),
color=style_cfg.get("color", (255, 255, 255, 255)),
font_path=font_path,
stroke_color=style_cfg.get("stroke_color", (0, 0, 0, 200)),
stroke_width=style_cfg.get("stroke_width", 2),
bg_color=style_cfg.get("bg_color"),
bg_radius=style_cfg.get("highlight_bg_radius", 18),
max_width=int(size[0] * 0.9),
)
if clip:
safe_pos = SubtitleManager._safe_position(clip, pos, size)
all_clips.append(
clip.set_start(chunk["start"])
.set_end(chunk["end"])
.set_position(safe_pos)
)
return all_clips
@staticmethod
def create_captions(
video_clip, transcript_data,
size: tuple = (1080, 1920), language: str = None,
caption_mode: str = "sentence", caption_style: str = "classic",
):
clips = SubtitleManager.create_caption_clips(
transcript_data, size=size, language=language,
caption_mode=caption_mode, caption_style=caption_style,
)
return mpe.CompositeVideoClip([video_clip] + clips, size=size)