""" Lyrics acquisition from online databases. Fetches reference (correct) lyrics given artist + title. Supports synced LRC format and plain text. Priority sources: 1. LRCLIB (free, no auth, synced LRC available) 2. syncedlyrics library (multi-source aggregator) 3. Genius (plain text fallback, requires API key) """ import logging import re from dataclasses import dataclass, field from typing import Optional import requests logger = logging.getLogger(__name__) @dataclass class LyricLine: """A single line of lyrics with optional timing.""" text: str timestamp: Optional[float] = None # seconds (from LRC format) @property def words(self) -> list[str]: """Split line into words.""" return self.text.split() @dataclass class Lyrics: """Complete lyrics for a song.""" plain_text: str lines: list[LyricLine] = field(default_factory=list) synced: bool = False # Whether line-level timestamps are available source: str = "unknown" @property def words(self) -> list[str]: """All words in the lyrics, preserving order.""" return self.plain_text.split() @property def word_count(self) -> int: return len(self.words) def parse_lrc(lrc_text: str) -> list[LyricLine]: """ Parse LRC format into LyricLine objects. LRC format: [MM:SS.cs] Lyrics text here Enhanced LRC: [MM:SS.cs] word word ... """ lines = [] # Pattern: [MM:SS.cc] or [MM:SS.ccc] pattern = r"\[(\d{2}):(\d{2})\.(\d{2,3})\]\s*(.*)" for raw_line in lrc_text.strip().split("\n"): raw_line = raw_line.strip() if not raw_line: continue match = re.match(pattern, raw_line) if match: minutes = int(match.group(1)) seconds = int(match.group(2)) centiseconds = match.group(3) # Handle both 2-digit (centiseconds) and 3-digit (milliseconds) if len(centiseconds) == 2: frac = int(centiseconds) / 100.0 else: frac = int(centiseconds) / 1000.0 timestamp = minutes * 60 + seconds + frac text = match.group(4).strip() # Strip enhanced LRC word-level tags if present text = re.sub(r"<\d{2}:\d{2}\.\d{2,3}>", "", text).strip() if text: # Skip empty lines (instrumental markers) lines.append(LyricLine(text=text, timestamp=timestamp)) else: # Non-timestamped line (metadata like [ar:Artist] or plain text) if not raw_line.startswith("["): lines.append(LyricLine(text=raw_line)) return lines class LRCLIBFetcher: """ Fetch lyrics from LRCLIB.net — free, no auth, community-maintained. Returns both synced LRC and plain text when available. """ BASE_URL = "https://lrclib.net/api" def fetch( self, artist: str, title: str, album: Optional[str] = None, duration: Optional[float] = None, ) -> Optional[Lyrics]: """ Fetch lyrics by metadata match. Args: artist: Artist name title: Track title album: Album name (optional, improves match accuracy) duration: Track duration in seconds (optional) """ params = { "artist_name": artist, "track_name": title, } if album: params["album_name"] = album if duration: params["duration"] = int(duration) try: resp = requests.get(f"{self.BASE_URL}/get", params=params, timeout=10) if resp.status_code == 404: logger.debug(f"LRCLIB: no match for {artist} - {title}") return None resp.raise_for_status() data = resp.json() except (requests.RequestException, ValueError) as e: logger.warning(f"LRCLIB request failed: {e}") return None synced_lrc = data.get("syncedLyrics") plain = data.get("plainLyrics", "") if synced_lrc: lines = parse_lrc(synced_lrc) return Lyrics( plain_text=plain or "\n".join(l.text for l in lines), lines=lines, synced=True, source="lrclib", ) elif plain: lines = [LyricLine(text=line.strip()) for line in plain.split("\n") if line.strip()] return Lyrics(plain_text=plain, lines=lines, synced=False, source="lrclib") return None def search(self, query: str) -> Optional[Lyrics]: """Search LRCLIB by text query (fuzzy).""" try: resp = requests.get(f"{self.BASE_URL}/search", params={"q": query}, timeout=10) if resp.status_code != 200: return None results = resp.json() if not results: return None # Take best result data = results[0] synced_lrc = data.get("syncedLyrics") plain = data.get("plainLyrics", "") if synced_lrc: lines = parse_lrc(synced_lrc) return Lyrics( plain_text=plain or "\n".join(l.text for l in lines), lines=lines, synced=True, source="lrclib", ) elif plain: lines = [LyricLine(text=line.strip()) for line in plain.split("\n") if line.strip()] return Lyrics(plain_text=plain, lines=lines, synced=False, source="lrclib") except (requests.RequestException, ValueError) as e: logger.debug(f"LRCLIB search failed: {e}") return None class SyncedLyricsFetcher: """ Multi-source fetcher using the syncedlyrics library. Tries: Lrclib → NetEase → Musixmatch → Megalobiz """ def fetch(self, artist: str, title: str) -> Optional[Lyrics]: """Fetch synced lyrics using multiple providers.""" try: import syncedlyrics except ImportError: logger.warning("syncedlyrics not installed. pip install syncedlyrics") return None query = f"{artist} {title}" try: lrc_text = syncedlyrics.search( query, providers=["Lrclib", "NetEase", "Musixmatch", "Megalobiz"], allow_plain_format=True, ) except Exception as e: logger.warning(f"syncedlyrics search failed: {e}") return None if not lrc_text: return None # Check if it's LRC format (has timestamps) if re.search(r"\[\d{2}:\d{2}\.\d{2,3}\]", lrc_text): lines = parse_lrc(lrc_text) return Lyrics( plain_text="\n".join(l.text for l in lines), lines=lines, synced=True, source="syncedlyrics", ) else: lines = [LyricLine(text=l.strip()) for l in lrc_text.split("\n") if l.strip()] return Lyrics( plain_text=lrc_text, lines=lines, synced=False, source="syncedlyrics", ) class GeniusFetcher: """ Fetch plain-text lyrics from Genius. Requires API token. No synced/timed lyrics available. """ def __init__(self, token: str): self.token = token def fetch(self, artist: str, title: str) -> Optional[Lyrics]: """Fetch lyrics from Genius API.""" try: import lyricsgenius except ImportError: logger.warning("lyricsgenius not installed. pip install lyricsgenius") return None try: genius = lyricsgenius.Genius(self.token, verbose=False) genius.remove_section_headers = True song = genius.search_song(title, artist) if song and song.lyrics: # Clean up Genius formatting artifacts text = self._clean_genius_lyrics(song.lyrics) lines = [LyricLine(text=l.strip()) for l in text.split("\n") if l.strip()] return Lyrics(plain_text=text, lines=lines, synced=False, source="genius") except Exception as e: logger.warning(f"Genius fetch failed: {e}") return None @staticmethod def _clean_genius_lyrics(raw: str) -> str: """Remove Genius-specific formatting.""" # Remove section headers like [Chorus], [Verse 1] text = re.sub(r"\[.*?\]", "", raw) # Remove "XEmbed" suffix and contributor info text = re.sub(r"\d+Embed$", "", text) text = re.sub(r"You might also like", "", text) # Clean up multiple blank lines text = re.sub(r"\n{3,}", "\n\n", text) return text.strip() def fetch_lyrics( artist: str, title: str, album: Optional[str] = None, duration: Optional[float] = None, genius_token: Optional[str] = None, ) -> Optional[Lyrics]: """ Fetch lyrics using the best available source. Priority: 1. LRCLIB (free, synced, no auth) 2. syncedlyrics (multi-source, synced) 3. Genius (plain text, requires token) Args: artist: Artist name title: Track title album: Album name (optional) duration: Track duration in seconds (optional) genius_token: Genius API token (optional, for fallback) Returns: Lyrics object or None """ # 1. LRCLIB lrclib = LRCLIBFetcher() result = lrclib.fetch(artist, title, album, duration) if result: logger.info(f"Lyrics from LRCLIB (synced={result.synced}): {len(result.words)} words") return result # 2. syncedlyrics multi-source synced = SyncedLyricsFetcher() result = synced.fetch(artist, title) if result: logger.info(f"Lyrics from syncedlyrics (synced={result.synced}): {len(result.words)} words") return result # 3. Genius (plain text fallback) if genius_token: genius = GeniusFetcher(genius_token) result = genius.fetch(artist, title) if result: logger.info(f"Lyrics from Genius (plain): {len(result.words)} words") return result logger.warning(f"No lyrics found for: {artist} - {title}") return None