Upload lyric_sync/output.py

4073070 verified 18 days ago

7.95 kB

	"""
	Output formatters for synchronized lyrics.

	Supports multiple standard formats:
	- LRC (Enhanced): Word-level timestamps in LRC format
	- LRC (Standard): Line-level timestamps only
	- JSON: Structured word-level data
	- SRT: Subtitle format (line-level)
	- ASS: Advanced SubStation Alpha (word-level karaoke)
	- Plain text with inline timestamps
	"""

	import json
	from typing import Optional

	from lyric_sync.transcribe import TimedWord


	def to_enhanced_lrc(words: list[TimedWord], line_break_gap: float = 1.0) -> str:
	    """
	    Render words as Enhanced LRC, one output line per phrase.

	    Words are split into phrases wherever the silence between the end of
	    one word and the start of the next exceeds ``line_break_gap``. Each
	    phrase is then rendered by ``_format_lrc_line`` using the start time of
	    its first word as the line timestamp.

	    Args:
	        words: Timed words to render.
	        line_break_gap: Gap in seconds that triggers a new line (default 1.0s).

	    Returns:
	        The rendered lines joined with newlines, or "" when ``words`` is empty.
	    """
	    if not words:
	        return ""

	    # Walk consecutive word pairs; a long enough pause opens a new phrase.
	    phrases = [[words[0]]]
	    for previous, current in zip(words, words[1:]):
	        if current.start - previous.end > line_break_gap:
	            phrases.append([])
	        phrases[-1].append(current)

	    return "\n".join(_format_lrc_line(phrase, phrase[0].start) for phrase in phrases)


	def _format_lrc_line(words: list[TimedWord], line_start: float) -> str:
	    """
	    Format a single Enhanced LRC line.

	    The line opens with the line-level tag ``[MM:SS.cc]``, continues with
	    every word prefixed by its own ``<MM:SS.cc>`` start tag, and closes with
	    a tag holding the end time of the last word.

	    Args:
	        words: Words of this line; must contain at least one word.
	        line_start: Start time of the line in seconds.

	    Returns:
	        The formatted line without a trailing newline.
	    """
	    line_ts = _format_lrc_timestamp(line_start)
	    # Format each word exactly once and reuse the result in the return value.
	    word_parts = [f"<{_format_lrc_timestamp(w.start)}> {w.word}" for w in words]
	    # Closing tag: end time of the final word.
	    end_ts = _format_lrc_timestamp(words[-1].end)
	    return f"[{line_ts}] {' '.join(word_parts)} <{end_ts}>"


	def _format_lrc_timestamp(seconds: float) -> str:
	"""Format seconds as MM:SS.cc (LRC standard)."""
	minutes = int(seconds // 60)
	secs = seconds % 60
	return f"{minutes:02d}:{secs:05.2f}"


	def to_standard_lrc(words: list[TimedWord], line_break_gap: float = 1.0) -> str:
	    """
	    Render words as standard LRC (one timestamp per line).

	    Output line shape::

	        [MM:SS.cc] Line of lyrics text

	    Args:
	        words: Timed words to render.
	        line_break_gap: A gap longer than this many seconds between the end
	            of one word and the start of the next begins a new line.

	    Returns:
	        The rendered lines joined with newlines, or "" when ``words`` is empty.
	    """
	    if not words:
	        return ""

	    # Indices where a line begins: the first word, plus every word that
	    # follows a long enough pause.
	    starts = [0] + [
	        index
	        for index in range(1, len(words))
	        if words[index].start - words[index - 1].end > line_break_gap
	    ]
	    stops = starts[1:] + [len(words)]

	    rendered = []
	    for first, stop in zip(starts, stops):
	        stamp = _format_lrc_timestamp(words[first].start)
	        lyric = " ".join(item.word for item in words[first:stop])
	        rendered.append(f"[{stamp}] {lyric}")
	    return "\n".join(rendered)


	def to_json(words: list[TimedWord], indent: int = 2) -> str:
	    """
	    Serialize words as a JSON array with one object per word.

	    Every object has the keys ``word``, ``start``, ``end`` and
	    ``confidence``; the three numeric values are rounded to three decimals::

	        [{"word": "hello", "start": 0.123, "end": 0.456, "confidence": 0.95}, ...]

	    Args:
	        words: Timed words to serialize.
	        indent: Indentation passed through to ``json.dumps``.

	    Returns:
	        The JSON text. Non-ASCII characters are written as-is, not escaped.
	    """
	    records = []
	    for item in words:
	        record = {"word": item.word}
	        for field in ("start", "end", "confidence"):
	            record[field] = round(getattr(item, field), 3)
	        records.append(record)
	    return json.dumps(records, ensure_ascii=False, indent=indent)


	def to_srt(words: list[TimedWord], line_break_gap: float = 1.0, max_words_per_line: int = 10) -> str:
	    """
	    Render words as SRT subtitles (one cue per line of lyrics).

	    Cue shape::

	        1
	        00:00:01,230 --> 00:00:03,456
	        Line of lyrics text

	    A new cue starts when the pause before a word exceeds
	    ``line_break_gap`` or when the current cue already holds
	    ``max_words_per_line`` words.

	    Args:
	        words: Timed words to render.
	        line_break_gap: Pause in seconds that forces a new cue.
	        max_words_per_line: Word count at which a cue is closed.

	    Returns:
	        Numbered cues separated by blank lines, or "" when ``words`` is empty.
	    """
	    if not words:
	        return ""

	    cues = []
	    pending = []
	    for item in words:
	        if pending:
	            long_pause = item.start - pending[-1].end > line_break_gap
	            cue_is_full = len(pending) >= max_words_per_line
	            if long_pause or cue_is_full:
	                cues.append(pending)
	                pending = []
	        pending.append(item)
	    # ``words`` is non-empty here, so there is always a last cue to flush.
	    cues.append(pending)

	    blocks = []
	    for number, cue in enumerate(cues, start=1):
	        begin = _format_srt_timestamp(cue[0].start)
	        finish = _format_srt_timestamp(cue[-1].end)
	        caption = " ".join(item.word for item in cue)
	        blocks.append(f"{number}\n{begin} --> {finish}\n{caption}\n")
	    return "\n".join(blocks)


	def _format_srt_timestamp(seconds: float) -> str:
	"""Format seconds as HH:MM:SS,mmm (SRT standard)."""
	hours = int(seconds // 3600)
	minutes = int((seconds % 3600) // 60)
	secs = seconds % 60
	millis = int((secs % 1) * 1000)
	return f"{hours:02d}:{minutes:02d}:{int(secs):02d},{millis:03d}"


	def to_ass_karaoke(
	    words: list[TimedWord],
	    line_break_gap: float = 1.0,
	    style_name: str = "Default",
	) -> str:
	    """
	    Format as ASS (Advanced SubStation Alpha) with karaoke timing.

	    Every word is preceded by a ``\\kf`` tag whose value is the time, in
	    centiseconds, until the next word of the same line starts (for the last
	    word: until that word ends). Silence between words is therefore part of
	    the preceding tag, so highlighting stays aligned with the audio and the
	    tag durations add up to the length of the Dialogue event.

	    Args:
	        words: Timed words to render.
	        line_break_gap: A gap longer than this many seconds between the end
	            of one word and the start of the next begins a new Dialogue line.
	        style_name: Name used for the style definition and on every event.

	    Returns:
	        The complete ASS script (header plus Dialogue events), or "" when
	        ``words`` is empty.
	    """
	    if not words:
	        return ""

	    # The header is assembled from explicit lines so that its content does
	    # not depend on how this source file happens to be indented.
	    header_lines = [
	        "[Script Info]",
	        "Title: Synced Lyrics",
	        "ScriptType: v4.00+",
	        "PlayResX: 1920",
	        "PlayResY: 1080",
	        "",
	        "[V4+ Styles]",
	        "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding",
	        f"Style: {style_name},Arial,48,&H00FFFFFF,&H000000FF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,2,1,2,10,10,40,1",
	        "",
	        "[Events]",
	        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
	        "",  # keeps the newline that separates the header from the first event
	    ]
	    header = "\n".join(header_lines)

	    # Group words into lines
	    line_groups = []
	    current_line = []
	    for word in words:
	        if current_line and word.start - current_line[-1].end > line_break_gap:
	            line_groups.append(current_line)
	            current_line = []
	        current_line.append(word)
	    line_groups.append(current_line)

	    events = []
	    for line_words in line_groups:
	        start = _format_ass_timestamp(line_words[0].start)
	        end = _format_ass_timestamp(line_words[-1].end)

	        # A word's highlight ends where the next word starts; the last
	        # word's highlight ends at its own end time.
	        line_origin = line_words[0].start
	        boundaries = [w.start for w in line_words[1:]] + [line_words[-1].end]

	        # Round cumulative offsets (not individual durations) so rounding
	        # errors cannot accumulate across the line. ``max`` keeps durations
	        # non-negative when word timings overlap.
	        karaoke_parts = []
	        elapsed_cs = 0
	        for w, boundary in zip(line_words, boundaries):
	            boundary_cs = max(elapsed_cs, round((boundary - line_origin) * 100))
	            duration_cs = boundary_cs - elapsed_cs
	            karaoke_parts.append(f"{{\\kf{duration_cs}}}{w.word}")
	            elapsed_cs = boundary_cs

	        text = " ".join(karaoke_parts)
	        events.append(f"Dialogue: 0,{start},{end},{style_name},,0,0,0,,{text}")

	    return header + "\n".join(events)


	def _format_ass_timestamp(seconds: float) -> str:
	"""Format seconds as H:MM:SS.cc (ASS standard)."""
	hours = int(seconds // 3600)
	minutes = int((seconds % 3600) // 60)
	secs = seconds % 60
	centis = int((secs % 1) * 100)
	return f"{hours}:{minutes:02d}:{int(secs):02d}.{centis:02d}"


	def to_plain_inline(words: list[TimedWord], line_break_gap: float = 1.0) -> str:
	    """
	    Plain text with an inline timestamp in front of every word.

	    Example output::

	        [00:01.23] Hello [00:01.60] world
	        [00:04.10] this [00:04.35] is

	    Args:
	        words: Timed words to render.
	        line_break_gap: A gap longer than this many seconds between the end
	            of one word and the start of the next begins a new line.

	    Returns:
	        The rendered text, or "" when ``words`` is empty. The text never
	        starts with a line break, even when the first word begins later
	        than ``line_break_gap`` seconds into the audio.
	    """
	    if not words:
	        return ""

	    lines = []
	    prev_end = None  # None until the first word has been placed
	    for word in words:
	        # Line breaks are only taken between words; the silence before the
	        # first word must not produce a leading empty line.
	        if prev_end is None or word.start - prev_end > line_break_gap:
	            lines.append([])
	        ts = _format_lrc_timestamp(word.start)
	        lines[-1].append(f"[{ts}] {word.word}")
	        prev_end = word.end

	    return "\n".join(" ".join(line) for line in lines)