Spaces:

lablab-ai-amd-developer-hackathon
/

ElevenClip-AI

Runtime error

ElevenClip-AI / backend /src /processing /subtitle.py

JakgritB

Deploy safe hackathon demo

102f4d2 about 2 months ago

10.3 kB

	"""Generate ASS subtitles using pysubs2.

	Supports: word-by-word, sentence, karaoke, fade, pop, bounce, typewriter animations.
	Full ASS spec: font, size, 4-color layers, border, shadow, position, alignment.
	Handles character-level splitting for Thai, Chinese, Japanese, Khmer and Lao.
	"""
	from pathlib import Path
	from typing import Optional
	import pysubs2
	from pysubs2 import SSAFile, SSAEvent, SSAStyle
	from loguru import logger

	# Language codes whose subtitles are broken into single characters instead
	# of whitespace-delimited words.
	CHAR_LEVEL_LANGUAGES = {
	    "th",
	    "zh",
	    "ja",
	    "km",
	    "lo",
	}

	# Font family picked for a subtitle language when the caller does not name
	# one; the "default" entry covers every language not listed here.
	DEFAULT_FONTS = {
	    "th": "Noto Sans Thai",
	    "zh": "Noto Sans SC",
	    "zh-tw": "Noto Sans TC",
	    "ja": "Noto Sans JP",
	    "ko": "Noto Sans KR",
	    "en": "Montserrat",
	    "default": "Noto Sans",
	}

	# Animation presets (ASS override tags)
	def _fade_tags(fade_in_ms: int = 200, fade_out_ms: int = 200) -> str:
	return f"{{\\fade({fade_in_ms},{fade_out_ms})}}"

	def _pop_tags() -> str:
	return "{\\t(0,100,\\fscx120\\fscy120)\\t(100,200,\\fscx100\\fscy100)}"

	def _typewriter_per_char(char: str, delay_ms: int) -> str:
	return f"{{\\alpha&HFF&\\t({delay_ms},{delay_ms+80},\\alpha&H00&)}}{char}"

	def _bounce_tags() -> str:
	return "{\\t(0,150,\\frz-5)\\t(150,300,\\frz5)\\t(300,400,\\frz0)}"


	def _color_to_ass(hex_color: str, alpha: int = 0) -> str:
	"""Convert #RRGGBB hex to ASS &HAABBGGRR format."""
	hex_color = hex_color.lstrip("#")
	if len(hex_color) == 6:
	r, g, b = hex_color[0:2], hex_color[2:4], hex_color[4:6]
	else:
	r, g, b = "FF", "FF", "FF"
	aa = f"{alpha:02X}"
	return f"&H{aa}{b}{g}{r}"


	def build_style(
	    font_family: str = "Noto Sans",
	    font_size: int = 72,
	    primary_color: str = "#FFFFFF",
	    secondary_color: str = "#FFFF00",
	    outline_color: str = "#000000",
	    shadow_color: str = "#000000",
	    primary_alpha: int = 0,
	    outline_alpha: int = 0,
	    shadow_alpha: int = 80,
	    bold: bool = True,
	    italic: bool = False,
	    underline: bool = False,
	    outline_size: float = 4.0,
	    shadow_size: float = 2.0,
	    alignment: int = 2,  # 2=bottom-center, 8=top-center
	    margin_l: int = 40,
	    margin_r: int = 40,
	    margin_v: int = 250,
	    scale_x: int = 100,
	    scale_y: int = 100,
	    spacing: float = 0.0,
	    angle: float = 0.0,
	) -> SSAStyle:
	    """Build an ``SSAStyle`` from flat font, color and layout settings.

	    Colors are ``#RRGGBB`` strings converted through ``_hex_to_rgba``; the
	    ``*_alpha`` values become the alpha channel of the matching color. The
	    secondary color always gets alpha 0, and the shadow color is stored as
	    the style's back color. The border style is fixed to 1 (outline + shadow).
	    """
	    style = SSAStyle(
	        # Font
	        fontname=font_family,
	        fontsize=font_size,
	        bold=bold,
	        italic=italic,
	        underline=underline,
	        # Colors
	        primarycolor=pysubs2.Color(*_hex_to_rgba(primary_color, primary_alpha)),
	        secondarycolor=pysubs2.Color(*_hex_to_rgba(secondary_color, 0)),
	        outlinecolor=pysubs2.Color(*_hex_to_rgba(outline_color, outline_alpha)),
	        backcolor=pysubs2.Color(*_hex_to_rgba(shadow_color, shadow_alpha)),
	        # Border and shadow
	        borderstyle=1,
	        outline=outline_size,
	        shadow=shadow_size,
	        # Placement
	        alignment=alignment,
	        marginl=margin_l,
	        marginr=margin_r,
	        marginv=margin_v,
	        # Geometry
	        scalex=scale_x,
	        scaley=scale_y,
	        spacing=spacing,
	        angle=angle,
	    )
	    return style


	def _hex_to_rgba(hex_color: str, alpha_0_255: int = 0):
	"""Convert #RRGGBB to (R, G, B, A) where A=0 is opaque."""
	hex_color = hex_color.lstrip("#")
	if len(hex_color) == 6:
	r = int(hex_color[0:2], 16)
	g = int(hex_color[2:4], 16)
	b = int(hex_color[4:6], 16)
	else:
	r, g, b = 255, 255, 255
	return r, g, b, alpha_0_255


	def generate_subtitles(
	    transcript: dict,
	    output_path: Path,
	    style_config: dict,
	    clip_start_offset: float = 0.0,
	) -> Path:
	    """Generate .ass subtitle file from transcript.

	    The script canvas is fixed at 1080x1920. Each segment becomes either one
	    sentence event, one karaoke line, or a run of per-word events, depending
	    on ``display_mode`` and ``animation`` in ``style_config``.

	    Args:
	        transcript: Output from whisper.py; read for "segments" (each with
	            "start", "end", "text" and optional "words") and "char_level".
	        output_path: Where to save the .ass file (``str`` is accepted too).
	        style_config: Dict with font/color/animation settings from frontend
	        clip_start_offset: Shift all timestamps (for sub-clips from longer video)

	    Returns:
	        ``output_path`` as a ``Path``.
	    """
	    output_path = Path(output_path)

	    subs = SSAFile()
	    subs.info["PlayResX"] = "1080"
	    subs.info["PlayResY"] = "1920"
	    subs.info["ScaledBorderAndShadow"] = "yes"
	    subs.info["WrapStyle"] = "0"

	    display_mode = style_config.get("display_mode", "word")  # "word" or "sentence"
	    animation = style_config.get("animation", "none")  # none|fade|karaoke|pop|typewriter|bounce
	    subtitle_lang = style_config.get("subtitle_language", "en")

	    # Normalise the code so regional variants ("zh-TW", "ja_JP", "TH") still
	    # reach the character-level check and the font table, both of which are
	    # keyed by lower-case codes.
	    lang_key = str(subtitle_lang or "").strip().lower().replace("_", "-")
	    base_lang = lang_key.split("-")[0]
	    char_level = bool(transcript.get("char_level", False)) or base_lang in CHAR_LEVEL_LANGUAGES

	    # Most specific font first: explicit choice, exact code, base language.
	    font_family = (
	        style_config.get("font_family")
	        or DEFAULT_FONTS.get(lang_key)
	        or DEFAULT_FONTS.get(base_lang)
	        or DEFAULT_FONTS["default"]
	    )

	    style = build_style(
	        font_family=font_family,
	        font_size=style_config.get("font_size", 72),
	        primary_color=style_config.get("primary_color", "#FFFFFF"),
	        secondary_color=style_config.get("secondary_color", "#FFFF00"),
	        outline_color=style_config.get("outline_color", "#000000"),
	        shadow_color=style_config.get("shadow_color", "#000000"),
	        primary_alpha=style_config.get("primary_alpha", 0),
	        outline_alpha=style_config.get("outline_alpha", 0),
	        shadow_alpha=style_config.get("shadow_alpha", 80),
	        bold=style_config.get("bold", True),
	        italic=style_config.get("italic", False),
	        underline=style_config.get("underline", False),
	        outline_size=style_config.get("outline_size", 4.0),
	        shadow_size=style_config.get("shadow_size", 2.0),
	        alignment=style_config.get("alignment", 2),
	        margin_l=style_config.get("margin_l", 40),
	        margin_r=style_config.get("margin_r", 40),
	        margin_v=style_config.get("margin_v", 250),
	        scale_x=style_config.get("scale_x", 100),
	        scale_y=style_config.get("scale_y", 100),
	        spacing=style_config.get("spacing", 0.0),
	        angle=style_config.get("angle", 0.0),
	    )
	    subs.styles["Default"] = style

	    # Treat an explicit ``"segments": None`` like a missing key.
	    segments = transcript.get("segments") or []

	    for seg in segments:
	        words = seg.get("words", [])
	        seg_end = seg["end"] - clip_start_offset
	        if seg_end <= 0:
	            continue  # segment ends before clip starts — skip entirely

	        # A segment straddling the clip start is kept and begins at 0.
	        seg_start = max(0.0, seg["start"] - clip_start_offset)

	        if display_mode == "sentence" or not words:
	            _add_sentence_event(subs, seg["text"], seg_start, seg_end, animation, style_config)
	        elif animation == "karaoke":
	            _add_karaoke_line(subs, words, seg_start, seg_end, clip_start_offset, char_level)
	        else:
	            _add_word_events(subs, words, seg_start, seg_end, animation, char_level, style_config, clip_start_offset)

	    output_path.parent.mkdir(parents=True, exist_ok=True)
	    subs.save(str(output_path), encoding="utf-8")
	    logger.info(f"Generated {len(subs)} subtitle events → {output_path.name}")
	    return output_path


	def _add_sentence_event(subs, text, start, end, animation, style_config):
	    """Append one event that shows the whole sentence from ``start`` to ``end``.

	    ``start`` and ``end`` are in seconds. The "fade", "pop" and "bounce"
	    animations prepend their override block to the stripped text; any other
	    animation value leaves the text untagged. Fade durations come from
	    ``style_config`` ("fade_in_ms" / "fade_out_ms", 200 ms each by default).
	    """
	    if animation == "fade":
	        prefix = _fade_tags(
	            style_config.get("fade_in_ms", 200),
	            style_config.get("fade_out_ms", 200),
	        )
	    elif animation == "pop":
	        prefix = _pop_tags()
	    elif animation == "bounce":
	        prefix = _bounce_tags()
	    else:
	        prefix = ""

	    subs.append(
	        SSAEvent(
	            start=pysubs2.make_time(s=start),
	            end=pysubs2.make_time(s=end),
	            text=prefix + text.strip(),
	        )
	    )


	def _add_word_events(subs, words, seg_start, seg_end, animation, char_level, style_config, clip_offset=0.0):
	    """Add one SSAEvent per word, or per character cluster when ``char_level`` is set.

	    Args:
	        subs: Event container; events are added with ``subs.append``.
	        words: Dicts with "word", "start" and "end" keys; times are seconds on
	            the same timeline as ``clip_offset``.
	        seg_start: Clip-relative segment start. Not needed here; kept so the
	            signature stays unchanged.
	        seg_end: Clip-relative segment end. Likewise unused.
	        animation: "fade", "pop", "bounce" or "typewriter"; any other value
	            emits plain text.
	        char_level: Split every word into character clusters.
	        style_config: Read for "fade_in_ms" / "fade_out_ms" when fading.
	        clip_offset: Seconds subtracted from every timestamp.
	    """
	    import unicodedata

	    def clusters_of(text):
	        # Keep combining marks (Unicode category M*) attached to the preceding
	        # base character so a mark is never rendered on its own.
	        clusters = []
	        for ch in text:
	            if clusters and unicodedata.category(ch).startswith("M"):
	                clusters[-1] += ch
	            else:
	                clusters.append(ch)
	        return clusters

	    units = []
	    for w in words:
	        if not char_level:
	            units.append(w)
	            continue
	        pieces = [c for c in clusters_of(w["word"]) if not c.isspace()]
	        if not pieces:
	            continue
	        # Share the word's time span evenly between its characters. Giving
	        # every character the whole span would show them all at once as
	        # separate, overlapping events.
	        w_start = w["start"]
	        step = max(w["end"] - w_start, 0.0) / len(pieces)
	        for idx, piece in enumerate(pieces):
	            units.append({
	                "word": piece,
	                "start": w_start + idx * step,
	                "end": w_start + (idx + 1) * step,
	            })

	    for unit in units:
	        text = unit["word"].strip()
	        if not text:
	            continue  # nothing visible to display

	        start = unit["start"] - clip_offset
	        # Units without a usable duration are shown for 0.3 s.
	        end = (unit["end"] - clip_offset) if unit["end"] > unit["start"] else start + 0.3
	        if end <= 0:
	            continue  # finished before the clip starts
	        # A unit straddling the clip start is kept and begins at 0, matching
	        # how whole segments are clamped by the caller.
	        start = max(0.0, start)

	        if animation == "fade":
	            fi = style_config.get("fade_in_ms", 150)
	            fo = style_config.get("fade_out_ms", 100)
	            text = _fade_tags(fi, fo) + text
	        elif animation == "pop":
	            text = _pop_tags() + text
	        elif animation == "bounce":
	            text = _bounce_tags() + text
	        elif animation == "typewriter":
	            # \t times are relative to this event's own start, so delays are
	            # computed inside the event. Characters are revealed across the
	            # first half of its duration, leaving the rest for reading.
	            glyphs = clusters_of(text)
	            reveal_ms = int((end - start) * 1000) // 2
	            step_ms = reveal_ms // len(glyphs)
	            text = "".join(
	                _typewriter_per_char(glyph, idx * step_ms)
	                for idx, glyph in enumerate(glyphs)
	            )

	        subs.append(
	            SSAEvent(
	                start=pysubs2.make_time(s=start),
	                end=pysubs2.make_time(s=end),
	                text=text,
	            )
	        )


	def _add_karaoke_line(subs, words, seg_start, seg_end, clip_offset, char_level):
	    """Add karaoke-style line: full line visible, words highlight in sequence.

	    Args:
	        subs: Event container; the line is added with ``subs.append``.
	        words: Dicts with "word", "start" and "end" keys; times are seconds on
	            the same timeline as ``clip_offset``.
	        seg_start: Clip-relative start of the line, in seconds.
	        seg_end: Clip-relative end of the line, in seconds.
	        clip_offset: Seconds subtracted from every word timestamp.
	        char_level: Emit one ``\\kf`` syllable per character cluster (no
	            spaces between words) instead of one per word.
	    """
	    import unicodedata

	    def to_cs(seconds):
	        # Centiseconds measured from the start of the line.
	        return round((seconds - clip_offset - seg_start) * 100)

	    parts = []
	    cursor_cs = 0  # karaoke time already consumed by earlier syllables
	    for w in words:
	        word_text = w["word"].strip()
	        if not word_text:
	            continue  # its time is absorbed into the next gap

	        # Never move backwards: words that begin before the line (or overlap
	        # the previous word) are clamped to the cursor.
	        start_cs = max(to_cs(w["start"]), cursor_cs)
	        end_cs = max(to_cs(w["end"]), start_cs)

	        # Karaoke durations accumulate from the line start, so silence before
	        # a word needs an empty \k syllable or later highlights run early.
	        if start_cs > cursor_cs:
	            parts.append(f"{{\\k{start_cs - cursor_cs}}}")

	        duration_cs = end_cs - start_cs
	        if char_level:
	            # Keep combining marks (Unicode category M*) with their base
	            # character so no tag is inserted between them.
	            clusters = []
	            for ch in word_text:
	                if clusters and unicodedata.category(ch).startswith("M"):
	                    clusters[-1] += ch
	                else:
	                    clusters.append(ch)
	            # Hand out the remainder so the pieces add up to the full word.
	            base, extra = divmod(duration_cs, len(clusters))
	            for idx, cluster in enumerate(clusters):
	                parts.append(f"{{\\kf{base + (1 if idx < extra else 0)}}}{cluster}")
	        else:
	            parts.append(f"{{\\kf{duration_cs}}}{word_text} ")
	        cursor_cs = end_cs

	    karaoke_text = "".join(parts).strip()
	    if not karaoke_text:
	        return  # no visible words in this line

	    event = SSAEvent(
	        start=pysubs2.make_time(s=seg_start),
	        end=pysubs2.make_time(s=seg_end),
	        text=karaoke_text,
	    )
	    subs.append(event)


	def update_subtitle_event(
	    ass_path: Path,
	    event_index: int,
	    updates: dict,
	) -> Path:
	    """Update a single subtitle event (for editor patches).

	    Args:
	        ass_path: Subtitle file to load, modify and save in place.
	        event_index: Zero-based position of the event in the file.
	        updates: Optional "text" (new event text) and "start" / "end"
	            (new times in seconds); absent keys leave the field unchanged.

	    Returns:
	        ``ass_path``.

	    Raises:
	        IndexError: If ``event_index`` is negative or past the last event.
	    """
	    subs = SSAFile.load(str(ass_path))
	    # Reject negative indexes too: Python would otherwise count from the end
	    # and silently patch the wrong event.
	    if not 0 <= event_index < len(subs):
	        raise IndexError(f"Event index {event_index} out of range")

	    evt = subs[event_index]
	    if "text" in updates:
	        evt.text = updates["text"]
	    if "start" in updates:
	        evt.start = pysubs2.make_time(s=updates["start"])
	    if "end" in updates:
	        evt.end = pysubs2.make_time(s=updates["end"])

	    subs.save(str(ass_path), encoding="utf-8")
	    return ass_path


	def apply_global_style_override(ass_path: Path, style_config: dict) -> Path:
	    """Rebuild the file's "Default" style from ``style_config`` (for live preview).

	    Keys of ``style_config`` that are not ``build_style`` parameters are
	    ignored; parameters that are absent take ``build_style``'s defaults,
	    except the font, which falls back to the default font of
	    ``style_config["subtitle_language"]`` ("en" when unset).

	    Args:
	        ass_path: Subtitle file to load, restyle and save in place.
	        style_config: Style settings, using ``build_style`` parameter names.

	    Returns:
	        ``ass_path``.
	    """
	    import inspect

	    # Filter on the real parameter list. ``__code__.co_varnames`` also lists
	    # build_style's local variables, so a stray key with such a name would be
	    # forwarded and raise TypeError.
	    accepted = inspect.signature(build_style).parameters
	    overrides = {k: v for k, v in style_config.items() if k in accepted}

	    # Without an explicit font, use the language default rather than
	    # build_style's generic one, which may lack the script's glyphs.
	    if not overrides.get("font_family"):
	        lang_key = str(style_config.get("subtitle_language", "en") or "").strip().lower().replace("_", "-")
	        overrides["font_family"] = (
	            DEFAULT_FONTS.get(lang_key)
	            or DEFAULT_FONTS.get(lang_key.split("-")[0])
	            or DEFAULT_FONTS["default"]
	        )

	    subs = SSAFile.load(str(ass_path))
	    subs.styles["Default"] = build_style(**overrides)
	    subs.save(str(ass_path), encoding="utf-8")
	    return ass_path