rikhoffbauer2
/

lyric-sync

ml-intern

Model card Files Files and versions

xet

Community

rikhoffbauer2 committed 28 days ago

Commit

4073070

verified ·

1 Parent(s): 1be2d0f

Upload lyric_sync/output.py

Browse files

Files changed (1) hide show

lyric_sync/output.py +257 -0

lyric_sync/output.py ADDED Viewed

	@@ -0,0 +1,257 @@

+"""
+Output formatters for synchronized lyrics.
+Supports multiple standard formats:
+- LRC (Enhanced): Word-level timestamps in LRC format
+- JSON: Structured word-level data
+- SRT: Subtitle format (line-level)
+- ASS: Advanced SubStation Alpha (word-level karaoke)
+- Plain text with inline timestamps
+"""
+import json
+from typing import Optional
+from lyric_sync.transcribe import TimedWord
def to_enhanced_lrc(words: list[TimedWord], line_break_gap: float = 1.0) -> str:
    """
    Render timed words as Enhanced LRC text (word-level timestamps).

    Every output line is produced by ``_format_lrc_line`` and is stamped with
    the start time of the first word on that line.

    Args:
        words: Timed words
        line_break_gap: A new line begins whenever the pause between the end
            of one word and the start of the next exceeds this many seconds
            (default 1.0s)

    Returns:
        The formatted lines joined with newlines, or "" if there are no words.
    """
    if not words:
        return ""

    # Split the word sequence into lines at every pause longer than the gap.
    groups = [[words[0]]]
    for previous, current in zip(words, words[1:]):
        if current.start - previous.end > line_break_gap:
            groups.append([current])
        else:
            groups[-1].append(current)

    return "\n".join(_format_lrc_line(group, group[0].start) for group in groups)
def _format_lrc_line(words: list[TimedWord], line_start: float) -> str:
    """
    Format a single Enhanced LRC line.

    Output shape:
    [MM:SS.cc] <MM:SS.cc> word1 <MM:SS.cc> word2 <MM:SS.cc>
    The bracketed tag is the line start, each angle-bracket tag is the start
    of the word that follows it, and the trailing tag is the end of the last
    word.

    Args:
        words: Timed words that make up the line
        line_start: Start time of the line in seconds

    Returns:
        The formatted line. When ``words`` is empty only the bracketed line
        timestamp is returned.
    """
    line_ts = _format_lrc_timestamp(line_start)
    if not words:
        # Nothing to tag; also avoids indexing words[-1] on an empty list.
        return f"[{line_ts}]"
    # Build the per-word tags once (previously they were computed twice and
    # the first result was discarded).
    word_parts = [f"<{_format_lrc_timestamp(w.start)}> {w.word}" for w in words]
    end_ts = _format_lrc_timestamp(words[-1].end)
    return f"[{line_ts}] {' '.join(word_parts)} <{end_ts}>"
def _format_lrc_timestamp(seconds: float) -> str:
    """
    Format seconds as MM:SS.cc (LRC standard).

    The value is rounded to whole centiseconds *before* it is split into
    minutes and seconds, so a value such as 59.999 carries into the minutes
    field ("01:00.00") instead of rendering an invalid "00:60.00".

    Args:
        seconds: Time offset in seconds. Negative values are clamped to 0.

    Returns:
        The timestamp string; minutes are zero-padded to at least two digits
        and may grow beyond two digits for offsets of 100 minutes or more.
    """
    total_centis = max(0, int(round(float(seconds) * 100)))
    minutes, remainder = divmod(total_centis, 60 * 100)
    secs, centis = divmod(remainder, 100)
    return f"{minutes:02d}:{secs:02d}.{centis:02d}"
def to_standard_lrc(words: list[TimedWord], line_break_gap: float = 1.0) -> str:
    """
    Render timed words as standard LRC (one timestamp per line).

    Each output line looks like:
    [MM:SS.cc] Line of lyrics text

    Args:
        words: Timed words
        line_break_gap: A pause longer than this many seconds between the end
            of one word and the start of the next begins a new line

    Returns:
        The formatted lines joined with newlines, or "" if there are no words.
    """
    if not words:
        return ""

    # Indices of the words that open a line: the first word, plus every word
    # that follows a pause longer than the gap.
    line_starts = [0]
    for position, (previous, current) in enumerate(zip(words, words[1:]), start=1):
        if current.start - previous.end > line_break_gap:
            line_starts.append(position)
    line_stops = line_starts[1:] + [len(words)]

    rendered = []
    for first, stop in zip(line_starts, line_stops):
        stamp = _format_lrc_timestamp(words[first].start)
        lyric = " ".join(item.word for item in words[first:stop])
        rendered.append(f"[{stamp}] {lyric}")
    return "\n".join(rendered)
def to_json(words: list[TimedWord], indent: int = 2) -> str:
    """
    Serialize timed words as a JSON array of word objects.

    Example element:
    {"word": "hello", "start": 0.123, "end": 0.456, "confidence": 0.95}

    Args:
        words: Timed words
        indent: Indentation passed through to ``json.dumps``

    Returns:
        JSON text with start, end and confidence rounded to three decimals;
        non-ASCII characters are written as-is rather than escaped.
    """

    def as_record(item):
        # One plain dict per word; key order defines the emitted field order.
        return {
            "word": item.word,
            "start": round(item.start, 3),
            "end": round(item.end, 3),
            "confidence": round(item.confidence, 3),
        }

    records = list(map(as_record, words))
    return json.dumps(records, indent=indent, ensure_ascii=False)
def to_srt(words: list[TimedWord], line_break_gap: float = 1.0, max_words_per_line: int = 10) -> str:
    """
    Render timed words as SRT subtitles (one cue per lyric line).

    Each cue has the form:
    1
    00:00:01,230 --> 00:00:03,456
    Line of lyrics text

    Args:
        words: Timed words
        line_break_gap: A pause longer than this many seconds between two
            consecutive words closes the current cue
        max_words_per_line: A cue is also closed once it already holds this
            many words

    Returns:
        The cues, numbered from 1 and separated by blank lines, or "" if
        there are no words.
    """
    if not words:
        return ""

    # First pass: bucket the words into cues.
    cues = []
    bucket = []
    for item in words:
        if bucket:
            paused = item.start - bucket[-1].end > line_break_gap
            full = len(bucket) >= max_words_per_line
            if paused or full:
                cues.append(bucket)
                bucket = []
        bucket.append(item)
    cues.append(bucket)

    # Second pass: render each cue from its first word's start to its last
    # word's end.
    blocks = []
    for number, cue in enumerate(cues, start=1):
        begins = _format_srt_timestamp(cue[0].start)
        finishes = _format_srt_timestamp(cue[-1].end)
        lyric = " ".join(item.word for item in cue)
        blocks.append(f"{number}\n{begins} --> {finishes}\n{lyric}\n")
    return "\n".join(blocks)
def _format_srt_timestamp(seconds: float) -> str:
    """
    Format seconds as HH:MM:SS,mmm (SRT standard).

    The value is rounded once to whole milliseconds and then split into
    fields. Truncating the fractional part instead loses a millisecond to
    floating-point error (1.23 would render as "00:00:01,229").

    Args:
        seconds: Time offset in seconds. Negative values are clamped to 0.

    Returns:
        The timestamp string with zero-padded fields.
    """
    total_millis = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_millis, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def to_ass_karaoke(
    words: list[TimedWord],
    line_break_gap: float = 1.0,
    style_name: str = "Default",
) -> str:
    """
    Format as ASS (Advanced SubStation Alpha) with karaoke timing.

    Uses \\kf tags for word-level karaoke highlighting. Karaoke durations in
    a Dialogue line are consumed back to back starting at the line's start
    time, so each \\kfN tag here spans from the start of its word until the
    start of the next word (the last word spans until its own end). That
    keeps every highlight aligned with its word even when there are short
    pauses between words.

    Args:
        words: Timed words
        line_break_gap: A pause longer than this many seconds between two
            consecutive words starts a new Dialogue line
        style_name: Name of the style defined in the header and referenced by
            every Dialogue line

    Returns:
        The complete ASS script text, or "" if there are no words.
    """
    if not words:
        return ""
    header = f"""[Script Info]
Title: Synced Lyrics
ScriptType: v4.00+
PlayResX: 1920
PlayResY: 1080
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: {style_name},Arial,48,&H00FFFFFF,&H000000FF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,2,1,2,10,10,40,1
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
    # Group words into lines
    line_groups = []
    current_line = []
    for word in words:
        if current_line and word.start - current_line[-1].end > line_break_gap:
            line_groups.append(current_line)
            current_line = []
        current_line.append(word)
    if current_line:
        line_groups.append(current_line)

    events = []
    for line_words in line_groups:
        line_start = line_words[0].start
        start = _format_ass_timestamp(line_start)
        end = _format_ass_timestamp(line_words[-1].end)
        # Moment at which each word's highlight hands over: the next word's
        # start, or the line's end for the last word.
        handovers = [w.start for w in line_words[1:]] + [line_words[-1].end]
        # Build karaoke text with \kf tags. Durations are differences of
        # cumulative rounded offsets so rounding error never accumulates
        # across the line; max() keeps a duration from going negative if
        # words overlap.
        karaoke_parts = []
        elapsed_cs = 0
        for w, handover in zip(line_words, handovers):
            target_cs = max(elapsed_cs, int(round((handover - line_start) * 100)))
            karaoke_parts.append(f"{{\\kf{target_cs - elapsed_cs}}}{w.word}")
            elapsed_cs = target_cs
        text = " ".join(karaoke_parts)
        events.append(f"Dialogue: 0,{start},{end},{style_name},,0,0,0,,{text}")
    return header + "\n".join(events)
def _format_ass_timestamp(seconds: float) -> str:
    """
    Format seconds as H:MM:SS.cc (ASS standard).

    The value is rounded once to whole centiseconds and then split into
    fields. Truncating the fractional part instead loses a centisecond to
    floating-point error (1.23 would render as "0:00:01.22").

    Args:
        seconds: Time offset in seconds. Negative values are clamped to 0.

    Returns:
        The timestamp string; the hours field is not zero-padded.
    """
    total_centis = max(0, int(round(float(seconds) * 100)))
    hours, remainder = divmod(total_centis, 360_000)
    minutes, remainder = divmod(remainder, 6_000)
    secs, centis = divmod(remainder, 100)
    return f"{hours}:{minutes:02d}:{secs:02d}.{centis:02d}"
def to_plain_inline(words: list[TimedWord], line_break_gap: float = 1.0) -> str:
    """
    Plain text with an inline timestamp in front of every word.

    Example output:
    [00:01.23] Hello [00:01.80] world
    [00:04.10] this [00:04.45] is

    Args:
        words: Timed words
        line_break_gap: A pause longer than this many seconds between the end
            of one word and the start of the next begins a new output line

    Returns:
        The text lines joined with newlines, or "" if there are no words.
    """
    if not words:
        return ""
    lines = [[]]
    prev_end = None
    for word in words:
        # Only break between words: the first word never opens with a
        # newline, even when the song starts after a long intro.
        if prev_end is not None and word.start - prev_end > line_break_gap:
            lines.append([])
        ts = _format_lrc_timestamp(word.start)
        lines[-1].append(f"[{ts}] {word.word}")
        prev_end = word.end
    return "\n".join(" ".join(line) for line in lines)