# lyric-sync / lyric_sync/lyrics.py
# Uploaded by rikhoffbauer2 ("Upload lyric_sync/lyrics.py", commit 1314870, verified).
"""
Lyrics acquisition from online databases.
Fetches reference (correct) lyrics given artist + title.
Supports synced LRC format and plain text.
Priority sources:
1. LRCLIB (free, no auth, synced LRC available)
2. syncedlyrics library (multi-source aggregator)
3. Genius (plain text fallback, requires API key)
"""
import logging
import re
from dataclasses import dataclass, field
from typing import Optional
import requests
# Module-level logger, named after this module; the fetchers in this file
# report debug/info/warning messages through it.
logger = logging.getLogger(__name__)
@dataclass
class LyricLine:
    """One line of a song's lyrics, optionally tagged with a start time.

    Attributes:
        text: The lyric text of the line.
        timestamp: Start time in seconds (as read from an LRC time tag), or
            ``None`` when the line carries no timing information.
    """

    text: str
    timestamp: Optional[float] = None

    @property
    def words(self) -> list[str]:
        """Return the whitespace-separated tokens of ``text``."""
        tokens = self.text.split()
        return tokens
@dataclass
class Lyrics:
    """The full lyrics of one song plus where they came from.

    Attributes:
        plain_text: The lyrics as a single plain-text string.
        lines: The individual lyric lines (empty list by default).
        synced: ``True`` when line-level timestamps are available.
        source: Short label naming the provider the lyrics came from.
    """

    plain_text: str
    lines: list[LyricLine] = field(default_factory=list)
    synced: bool = False
    source: str = "unknown"

    @property
    def words(self) -> list[str]:
        """Return every whitespace-separated word of ``plain_text``, in order."""
        tokens = self.plain_text.split()
        return tokens

    @property
    def word_count(self) -> int:
        """Return how many words ``words`` contains."""
        all_words = self.words
        return len(all_words)
def parse_lrc(lrc_text: str) -> list[LyricLine]:
    """
    Parse LRC text into LyricLine objects.

    Supported input:
        * Standard lines:      ``[MM:SS.cc] Lyrics text here``
        * Millisecond tags:    ``[MM:SS.mmm] ...``
        * Short/long tags:     ``[M:SS]``, ``[MM:SS]``, ``[MMM:SS.c]``
        * Repeated time tags:  ``[00:12.00][00:45.00] Chorus`` -> one
          LyricLine per tag (same text, different timestamps)
        * Enhanced LRC:        ``[MM:SS.cc] <MM:SS.cc> word <MM:SS.cc> word``
          (word-level tags are stripped and spacing is normalised)

    Lines that start with ``[`` but carry no time tag (metadata such as
    ``[ar:Artist]``) are dropped; any other untagged line is kept as an
    untimed LyricLine. Timed lines whose text is empty (instrumental
    markers) are skipped.

    Args:
        lrc_text: Raw LRC file contents.

    Returns:
        LyricLine objects in file order. If any line used repeated time
        tags, the timed lines are re-ordered chronologically (untimed lines
        keep their positions).
    """
    # One leading time tag plus any blanks after it. The fractional part is
    # optional and 1-3 digits wide; its width decides the divisor.
    stamp_re = re.compile(r"\[(\d{1,3}):(\d{2})(?:\.(\d{1,3}))?\]\s*")
    # Enhanced-LRC inline word tag, e.g. <00:12.34>.
    word_tag_re = re.compile(r"<\d{1,3}:\d{2}(?:\.\d{1,3})?>")

    lines: list[LyricLine] = []
    saw_repeated_stamps = False

    for raw_line in lrc_text.strip().split("\n"):
        raw_line = raw_line.strip()
        if not raw_line:
            continue

        # Consume every time tag at the start of the line.
        stamps: list[float] = []
        pos = 0
        match = stamp_re.match(raw_line, pos)
        while match:
            minutes, seconds, fraction = match.groups()
            frac = int(fraction) / 10 ** len(fraction) if fraction else 0.0
            stamps.append(int(minutes) * 60 + int(seconds) + frac)
            pos = match.end()
            match = stamp_re.match(raw_line, pos)

        if not stamps:
            # Untimed: keep plain text, drop bracketed metadata tags.
            if not raw_line.startswith("["):
                lines.append(LyricLine(text=raw_line))
            continue

        text, removed = word_tag_re.subn("", raw_line[pos:])
        if removed:
            # Removing word tags leaves doubled blanks behind; collapse them.
            text = " ".join(text.split())
        else:
            text = text.strip()
        if not text:
            continue  # instrumental marker / empty timed line

        if len(stamps) > 1:
            saw_repeated_stamps = True
        lines.extend(LyricLine(text=text, timestamp=stamp) for stamp in stamps)

    if saw_repeated_stamps:
        # Repeated tags put later occurrences out of order; restore
        # chronological order among timed lines only.
        timed_slots = [i for i, line in enumerate(lines) if line.timestamp is not None]
        in_order = sorted(
            (lines[i] for i in timed_slots), key=lambda line: line.timestamp
        )
        for slot, line in zip(timed_slots, in_order):
            lines[slot] = line

    return lines
class LRCLIBFetcher:
    """
    Fetch lyrics from LRCLIB.net — free, no auth, community-maintained.
    Returns both synced LRC and plain text when available.
    """

    BASE_URL = "https://lrclib.net/api"

    def fetch(
        self,
        artist: str,
        title: str,
        album: Optional[str] = None,
        duration: Optional[float] = None,
    ) -> Optional[Lyrics]:
        """
        Fetch lyrics by metadata match.

        Args:
            artist: Artist name
            title: Track title
            album: Album name (optional, improves match accuracy)
            duration: Track duration in seconds (optional)

        Returns:
            Lyrics built from the matched record, or None when there is no
            match (HTTP 404), the request fails, the response is not valid
            JSON, or the record contains no usable lyrics.
        """
        params = {
            "artist_name": artist,
            "track_name": title,
        }
        if album:
            params["album_name"] = album
        if duration:
            params["duration"] = int(duration)

        try:
            resp = requests.get(f"{self.BASE_URL}/get", params=params, timeout=10)
            if resp.status_code == 404:
                logger.debug(f"LRCLIB: no match for {artist} - {title}")
                return None
            resp.raise_for_status()
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            logger.warning(f"LRCLIB request failed: {e}")
            return None

        return self._record_to_lyrics(data)

    def search(self, query: str) -> Optional[Lyrics]:
        """
        Search LRCLIB by text query (fuzzy).

        Args:
            query: Free-text search string.

        Returns:
            Lyrics from the first search hit that actually contains lyrics,
            or None when the request fails, returns a non-200 status, or no
            hit has usable lyrics.
        """
        try:
            resp = requests.get(f"{self.BASE_URL}/search", params={"q": query}, timeout=10)
            if resp.status_code != 200:
                return None
            results = resp.json()
        except (requests.RequestException, ValueError) as e:
            logger.debug(f"LRCLIB search failed: {e}")
            return None

        if not isinstance(results, list):
            return None

        # Results come back best-first; skip hits without lyrics instead of
        # giving up on the first one.
        for record in results:
            lyrics = self._record_to_lyrics(record)
            if lyrics is not None:
                return lyrics
        return None

    @staticmethod
    def _record_to_lyrics(record: object) -> Optional[Lyrics]:
        """Convert one LRCLIB JSON record into Lyrics, or None if it has none."""
        if not isinstance(record, dict):
            return None

        synced_lrc = record.get("syncedLyrics")
        plain = record.get("plainLyrics") or ""

        if synced_lrc:
            lines = parse_lrc(synced_lrc)
            # Only report synced lyrics when parsing produced real lines;
            # otherwise fall through to the plain text (if any).
            if lines:
                return Lyrics(
                    plain_text=plain or "\n".join(line.text for line in lines),
                    lines=lines,
                    synced=True,
                    source="lrclib",
                )

        if plain:
            lines = [
                LyricLine(text=line.strip())
                for line in plain.split("\n")
                if line.strip()
            ]
            return Lyrics(plain_text=plain, lines=lines, synced=False, source="lrclib")

        return None
class SyncedLyricsFetcher:
    """
    Multi-source fetcher using the syncedlyrics library.
    Tries: Lrclib → NetEase → Musixmatch → Megalobiz
    """

    def fetch(self, artist: str, title: str) -> Optional[Lyrics]:
        """
        Fetch synced lyrics using multiple providers.

        Args:
            artist: Artist name
            title: Track title

        Returns:
            Lyrics with ``synced=True`` when the result contains LRC time
            tags, Lyrics with ``synced=False`` for plain text, or None when
            the library is missing, the search fails, or nothing usable is
            returned.
        """
        try:
            import syncedlyrics
        except ImportError:
            logger.warning("syncedlyrics not installed. pip install syncedlyrics")
            return None
        import inspect

        query = f"{artist} {title}"
        search_kwargs = {
            "providers": ["Lrclib", "NetEase", "Musixmatch", "Megalobiz"],
        }
        try:
            # Older syncedlyrics releases need allow_plain_format=True to
            # return unsynced text; newer ones dropped the keyword (plain
            # text is accepted by default), so only pass it when supported.
            # NOTE(review): confirm against the installed syncedlyrics version.
            if "allow_plain_format" in inspect.signature(syncedlyrics.search).parameters:
                search_kwargs["allow_plain_format"] = True
            lrc_text = syncedlyrics.search(query, **search_kwargs)
        except Exception as e:
            logger.warning(f"syncedlyrics search failed: {e}")
            return None

        if not lrc_text:
            return None

        # Check if it's LRC format (has timestamps)
        if re.search(r"\[\d{2}:\d{2}\.\d{2,3}\]", lrc_text):
            lines = parse_lrc(lrc_text)
            if not lines:
                # Time tags but no lyric text: treat as "nothing found".
                return None
            return Lyrics(
                plain_text="\n".join(line.text for line in lines),
                lines=lines,
                synced=True,
                source="syncedlyrics",
            )

        lines = [
            LyricLine(text=row.strip())
            for row in lrc_text.split("\n")
            if row.strip()
        ]
        if not lines:
            return None
        return Lyrics(
            plain_text=lrc_text,
            lines=lines,
            synced=False,
            source="syncedlyrics",
        )
class GeniusFetcher:
    """
    Fetch plain-text lyrics from Genius.
    Requires API token. No synced/timed lyrics available.
    """

    def __init__(self, token: str):
        """Store the Genius API token used for later requests."""
        self.token = token

    def fetch(self, artist: str, title: str) -> Optional[Lyrics]:
        """
        Fetch lyrics from Genius API.

        Args:
            artist: Artist name
            title: Track title

        Returns:
            Unsynced Lyrics (``source="genius"``), or None when lyricsgenius
            is not installed, the lookup fails, no song is found, or nothing
            is left after cleaning.
        """
        try:
            import lyricsgenius
        except ImportError:
            logger.warning("lyricsgenius not installed. pip install lyricsgenius")
            return None

        try:
            genius = lyricsgenius.Genius(self.token, verbose=False)
            genius.remove_section_headers = True
            song = genius.search_song(title, artist)
            if not (song and song.lyrics):
                return None
            # Clean up Genius formatting artifacts
            text = self._clean_genius_lyrics(song.lyrics)
        except Exception as e:
            # Best-effort fallback source: log and report "not found".
            logger.warning(f"Genius fetch failed: {e}")
            return None

        lines = [LyricLine(text=row.strip()) for row in text.split("\n") if row.strip()]
        if not lines:
            # Everything was formatting residue; don't return empty lyrics.
            return None
        return Lyrics(plain_text=text, lines=lines, synced=False, source="genius")

    @staticmethod
    def _clean_genius_lyrics(raw: str) -> str:
        """
        Remove Genius-specific formatting.

        Strips, in order: a leading "<N> Contributors…<Title> Lyrics" header,
        bracketed section headers, the trailing "Embed" marker (bare or with
        a count such as "12" / "1.2K"), the "You might also like" insert,
        and runs of three or more newlines.

        Args:
            raw: Lyrics text as returned by lyricsgenius.

        Returns:
            The cleaned text with surrounding whitespace removed.
        """
        text = raw.strip()
        # Page header glued in front of the lyrics, first line only.
        text = re.sub(r"\A\d+\s+Contributors?[^\n]*?Lyrics", "", text)
        # Remove section headers like [Chorus], [Verse 1]
        text = re.sub(r"\[.*?\]", "", text)
        # Remove trailing "Embed", "12Embed", "1.2KEmbed"
        text = re.sub(r"(?:\d+(?:\.\d+)?K?)?Embed\s*\Z", "", text)
        text = re.sub(r"You might also like", "", text)
        # Clean up multiple blank lines
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()
def fetch_lyrics(
    artist: str,
    title: str,
    album: Optional[str] = None,
    duration: Optional[float] = None,
    genius_token: Optional[str] = None,
) -> Optional[Lyrics]:
    """
    Look up lyrics for a track, trying each source in turn.

    Sources are queried in this order and the first hit wins:
        1. LRCLIB (free, synced, no auth)
        2. syncedlyrics (multi-source, synced)
        3. Genius (plain text; only tried when a token is supplied)

    Args:
        artist: Artist name
        title: Track title
        album: Album name (optional; passed to LRCLIB only)
        duration: Track duration in seconds (optional; passed to LRCLIB only)
        genius_token: Genius API token (optional, enables the Genius fallback)

    Returns:
        Lyrics object from the first source that returns one, or None
    """
    # First choice: LRCLIB, the only source that takes album/duration.
    found = LRCLIBFetcher().fetch(artist, title, album, duration)
    if found:
        logger.info(f"Lyrics from LRCLIB (synced={found.synced}): {len(found.words)} words")
        return found

    # Second choice: the syncedlyrics aggregator.
    found = SyncedLyricsFetcher().fetch(artist, title)
    if found:
        logger.info(f"Lyrics from syncedlyrics (synced={found.synced}): {len(found.words)} words")
        return found

    # Last resort: Genius plain text, which needs a token.
    found = GeniusFetcher(genius_token).fetch(artist, title) if genius_token else None
    if found:
        logger.info(f"Lyrics from Genius (plain): {len(found.words)} words")
        return found

    logger.warning(f"No lyrics found for: {artist} - {title}")
    return None