Upload lyric_sync/lyrics.py

1314870 verified 27 days ago

10.4 kB

	"""
	Lyrics acquisition from online databases.

	Fetches reference (correct) lyrics given artist + title.
	Supports synced LRC format and plain text.

	Priority sources:
	1. LRCLIB (free, no auth, synced LRC available)
	2. syncedlyrics library (multi-source aggregator)
	3. Genius (plain text fallback, requires API key)
	"""

	import logging
	import re
	from dataclasses import dataclass, field
	from typing import Optional

	import requests

	logger = logging.getLogger(__name__)


	@dataclass
	class LyricLine:
	    """One line of lyrics, optionally tagged with a start time."""

	    # The lyric text of this line.
	    text: str
	    # Start time in seconds (taken from an LRC tag); None when the line is untimed.
	    timestamp: Optional[float] = None

	    @property
	    def words(self) -> list[str]:
	        """Return the whitespace-separated tokens of ``text``."""
	        tokens = self.text.split()
	        return tokens


	@dataclass
	class Lyrics:
	    """The full lyrics of one song, together with where they came from."""

	    # Entire lyrics as one string.
	    plain_text: str
	    # Per-line entries; each instance starts with its own empty list.
	    lines: list[LyricLine] = field(default_factory=list)
	    # True when line-level timestamps are available.
	    synced: bool = False
	    # Label of the provider that supplied the lyrics.
	    source: str = "unknown"

	    @property
	    def words(self) -> list[str]:
	        """Return every whitespace-separated token of ``plain_text``, in order."""
	        tokens = self.plain_text.split()
	        return tokens

	    @property
	    def word_count(self) -> int:
	        """Return the number of tokens produced by ``words``."""
	        tokens = self.words
	        return len(tokens)


	def parse_lrc(lrc_text: str) -> list[LyricLine]:
	    """
	    Parse LRC-formatted text into LyricLine objects.

	    Recognised line shapes:
	        [MM:SS.cc] text       two fraction digits (centiseconds)
	        [MM:SS.mmm] text      three fraction digits (milliseconds)
	        [MM:SS.cc] <MM:SS.cc> word <MM:SS.cc> word ...   (enhanced LRC)

	    Whitespace between the time tag and the text is optional. Word-level
	    ``<MM:SS.cc>`` tags are removed and the whitespace they leave behind is
	    collapsed to single spaces. Timestamped lines with no text left are
	    dropped. Lines without a leading time tag are kept as untimed lyrics
	    unless they start with "[" (metadata such as ``[ar:Artist]``).

	    Args:
	        lrc_text: Raw LRC document.

	    Returns:
	        Parsed lines in file order; ``timestamp`` is in seconds, or None
	        for untimed lines.
	    """
	    # ``\s*(.*)`` makes the gap after the tag optional and captures the whole
	    # remainder of the line rather than a single character.
	    line_pattern = re.compile(r"\[(\d{2}):(\d{2})\.(\d{2,3})\]\s*(.*)")
	    word_tag_pattern = re.compile(r"<\d{2}:\d{2}\.\d{2,3}>")

	    lines = []
	    for raw_line in lrc_text.strip().split("\n"):
	        raw_line = raw_line.strip()
	        if not raw_line:
	            continue

	        match = line_pattern.match(raw_line)
	        if match is None:
	            # No leading time tag: keep plain text, skip bracketed metadata.
	            if not raw_line.startswith("["):
	                lines.append(LyricLine(text=raw_line))
	            continue

	        minutes, seconds, fraction = match.group(1, 2, 3)
	        # Two digits are hundredths of a second, three are thousandths.
	        timestamp = int(minutes) * 60 + int(seconds) + int(fraction) / 10 ** len(fraction)

	        text = match.group(4).strip()
	        untagged = word_tag_pattern.sub("", text)
	        if untagged != text:
	            # Removing word-level tags leaves doubled spaces behind.
	            text = " ".join(untagged.split())

	        if text:  # Skip empty lines (instrumental markers)
	            lines.append(LyricLine(text=text, timestamp=timestamp))

	    return lines


	class LRCLIBFetcher:
	    """
	    Fetch lyrics from LRCLIB.net — free, no auth, community-maintained.
	    Prefers the synced LRC field of a record and falls back to its plain text.
	    """

	    BASE_URL = "https://lrclib.net/api"

	    @staticmethod
	    def _lyrics_from_record(record: object) -> Optional[Lyrics]:
	        """
	        Convert one LRCLIB JSON record into a Lyrics object.

	        Returns None when the record is not a dict or carries neither
	        ``syncedLyrics`` nor ``plainLyrics``.
	        """
	        if not isinstance(record, dict):
	            return None

	        synced_lrc = record.get("syncedLyrics")
	        plain = record.get("plainLyrics", "")

	        if synced_lrc:
	            lines = parse_lrc(synced_lrc)
	            return Lyrics(
	                # Rebuild the plain text from the parsed lines when the record has none.
	                plain_text=plain or "\n".join(line.text for line in lines),
	                lines=lines,
	                synced=True,
	                source="lrclib",
	            )
	        if plain:
	            lines = [LyricLine(text=line.strip()) for line in plain.split("\n") if line.strip()]
	            return Lyrics(plain_text=plain, lines=lines, synced=False, source="lrclib")

	        return None

	    def fetch(
	        self,
	        artist: str,
	        title: str,
	        album: Optional[str] = None,
	        duration: Optional[float] = None,
	    ) -> Optional[Lyrics]:
	        """
	        Fetch lyrics by metadata match.

	        Args:
	            artist: Artist name
	            title: Track title
	            album: Album name (optional, sent only when truthy)
	            duration: Track duration in seconds (optional, sent truncated to an int)

	        Returns:
	            Lyrics on success; None on a 404, a request/JSON error, an
	            unexpected payload shape, or a record without lyrics.
	        """
	        params = {
	            "artist_name": artist,
	            "track_name": title,
	        }
	        if album:
	            params["album_name"] = album
	        if duration:
	            params["duration"] = int(duration)

	        try:
	            resp = requests.get(f"{self.BASE_URL}/get", params=params, timeout=10)
	            if resp.status_code == 404:
	                logger.debug(f"LRCLIB: no match for {artist} - {title}")
	                return None
	            resp.raise_for_status()
	            data = resp.json()
	        except (requests.RequestException, ValueError) as e:
	            logger.warning(f"LRCLIB request failed: {e}")
	            return None

	        if not isinstance(data, dict):
	            # Valid JSON but not a record object; there is nothing to read from it.
	            logger.warning(f"LRCLIB returned an unexpected payload for {artist} - {title}")
	            return None

	        return self._lyrics_from_record(data)

	    def search(self, query: str) -> Optional[Lyrics]:
	        """
	        Search LRCLIB by text query (fuzzy) and use the first result.

	        Returns:
	            Lyrics built from the first result; None on a non-200 status, a
	            request/JSON error, an empty or non-list payload, or a first
	            result without lyrics.
	        """
	        try:
	            resp = requests.get(f"{self.BASE_URL}/search", params={"q": query}, timeout=10)
	            if resp.status_code != 200:
	                return None
	            results = resp.json()
	        except (requests.RequestException, ValueError) as e:
	            logger.debug(f"LRCLIB search failed: {e}")
	            return None

	        # Indexing anything other than a non-empty list would raise or pick garbage.
	        if not isinstance(results, list) or not results:
	            return None

	        # Take best result
	        return self._lyrics_from_record(results[0])


	class SyncedLyricsFetcher:
	    """
	    Fetcher backed by the optional ``syncedlyrics`` package.
	    Providers requested: Lrclib → NetEase → Musixmatch → Megalobiz
	    """

	    def fetch(self, artist: str, title: str) -> Optional[Lyrics]:
	        """
	        Search ``syncedlyrics`` for "<artist> <title>" and wrap the result.

	        Returns None when the package is missing, the search raises, or
	        nothing is found. A result containing ``[MM:SS.xx]`` tags is parsed
	        as LRC and marked synced; anything else is treated as plain text.
	        """
	        try:
	            import syncedlyrics
	        except ImportError:
	            logger.warning("syncedlyrics not installed. pip install syncedlyrics")
	            return None

	        search_terms = f"{artist} {title}"
	        provider_names = ["Lrclib", "NetEase", "Musixmatch", "Megalobiz"]
	        try:
	            found = syncedlyrics.search(
	                search_terms,
	                providers=provider_names,
	                allow_plain_format=True,
	            )
	        except Exception as e:
	            logger.warning(f"syncedlyrics search failed: {e}")
	            return None

	        if not found:
	            return None

	        # A timestamp tag anywhere in the text means the result is in LRC format.
	        has_timestamps = re.search(r"\[\d{2}:\d{2}\.\d{2,3}\]", found) is not None
	        if has_timestamps:
	            parsed = parse_lrc(found)
	            body = "\n".join(entry.text for entry in parsed)
	        else:
	            parsed = [LyricLine(text=row.strip()) for row in found.split("\n") if row.strip()]
	            body = found

	        return Lyrics(
	            plain_text=body,
	            lines=parsed,
	            synced=has_timestamps,
	            source="syncedlyrics",
	        )


	class GeniusFetcher:
	    """
	    Fetch plain-text lyrics from Genius.
	    Requires API token. No synced/timed lyrics available.
	    """

	    def __init__(self, token: str):
	        """Store the Genius API token used by ``fetch``."""
	        self.token = token

	    def fetch(self, artist: str, title: str) -> Optional[Lyrics]:
	        """
	        Fetch lyrics through the optional ``lyricsgenius`` package.

	        Args:
	            artist: Artist name
	            title: Track title

	        Returns:
	            Unsynced Lyrics with source "genius"; None when the package is
	            missing, the lookup raises, no song is found, or nothing is
	            left after cleaning (e.g. the lyrics were only "[Instrumental]").
	        """
	        try:
	            import lyricsgenius
	        except ImportError:
	            logger.warning("lyricsgenius not installed. pip install lyricsgenius")
	            return None

	        # Best-effort boundary around a third-party client: log and fall through.
	        try:
	            genius = lyricsgenius.Genius(self.token, verbose=False)
	            genius.remove_section_headers = True
	            song = genius.search_song(title, artist)
	            if song and song.lyrics:
	                # Clean up Genius formatting artifacts
	                text = self._clean_genius_lyrics(song.lyrics)
	                if not text:
	                    # An empty Lyrics object is truthy and would be reported as a hit.
	                    return None
	                lines = [LyricLine(text=line.strip()) for line in text.split("\n") if line.strip()]
	                return Lyrics(plain_text=text, lines=lines, synced=False, source="genius")
	        except Exception as e:
	            logger.warning(f"Genius fetch failed: {e}")

	        return None

	    @staticmethod
	    def _clean_genius_lyrics(raw: str) -> str:
	        """
	        Remove Genius-specific formatting from scraped lyrics.

	        Strips bracketed section headers, a trailing "Embed" marker with its
	        optional count ("Embed", "5Embed", "1.2KEmbed"), the injected
	        "You might also like" phrase, and runs of three or more newlines.
	        """
	        # Remove section headers like [Chorus], [Verse 1]
	        text = re.sub(r"\[.*?\]", "", raw)
	        # The count before "Embed" is optional and may be abbreviated; the K
	        # is only consumed after digits so a lyric ending in "OK" stays intact.
	        text = re.sub(r"(?:\d+(?:\.\d+)?K?)?Embed\s*$", "", text)
	        text = text.replace("You might also like", "")
	        # Clean up multiple blank lines
	        text = re.sub(r"\n{3,}", "\n\n", text)
	        return text.strip()


	def fetch_lyrics(
	    artist: str,
	    title: str,
	    album: Optional[str] = None,
	    duration: Optional[float] = None,
	    genius_token: Optional[str] = None,
	) -> Optional[Lyrics]:
	    """
	    Try each lyrics provider in turn and return the first hit.

	    Order of attempts:
	    1. LRCLIBFetcher (also receives album and duration)
	    2. SyncedLyricsFetcher
	    3. GeniusFetcher (attempted only when genius_token is truthy)

	    Args:
	        artist: Artist name
	        title: Track title
	        album: Album name (optional)
	        duration: Track duration in seconds (optional)
	        genius_token: Genius API token (optional, enables the last fallback)

	    Returns:
	        The first truthy Lyrics result, or None after logging a warning
	        when every provider came back empty.
	    """
	    # 1. LRCLIB
	    from_lrclib = LRCLIBFetcher().fetch(artist, title, album, duration)
	    if from_lrclib:
	        logger.info(f"Lyrics from LRCLIB (synced={from_lrclib.synced}): {len(from_lrclib.words)} words")
	        return from_lrclib

	    # 2. syncedlyrics multi-source
	    from_aggregator = SyncedLyricsFetcher().fetch(artist, title)
	    if from_aggregator:
	        logger.info(f"Lyrics from syncedlyrics (synced={from_aggregator.synced}): {len(from_aggregator.words)} words")
	        return from_aggregator

	    # 3. Genius (plain text fallback); skipped entirely without a token
	    from_genius = GeniusFetcher(genius_token).fetch(artist, title) if genius_token else None
	    if from_genius:
	        logger.info(f"Lyrics from Genius (plain): {len(from_genius.words)} words")
	        return from_genius

	    logger.warning(f"No lyrics found for: {artist} - {title}")
	    return None