rikhoffbauer2 committed on
Commit
1314870
·
verified ·
1 Parent(s): 8b482f1

Upload lyric_sync/lyrics.py

Browse files
Files changed (1) hide show
  1. lyric_sync/lyrics.py +328 -0
lyric_sync/lyrics.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Lyrics acquisition from online databases.
3
+
4
+ Fetches reference (correct) lyrics given artist + title.
5
+ Supports synced LRC format and plain text.
6
+
7
+ Priority sources:
8
+ 1. LRCLIB (free, no auth, synced LRC available)
9
+ 2. syncedlyrics library (multi-source aggregator)
10
+ 3. Genius (plain text fallback, requires API key)
11
+ """
12
+
13
+ import logging
14
+ import re
15
+ from dataclasses import dataclass, field
16
+ from typing import Optional
17
+
18
+ import requests
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@dataclass
class LyricLine:
    """One lyric line together with the time it starts, when known."""

    text: str
    # Start time in seconds (taken from an LRC tag); None for untimed lines.
    timestamp: Optional[float] = None

    @property
    def words(self) -> list[str]:
        """Return the whitespace-separated tokens of ``text``."""
        tokens = self.text.split()
        return tokens
33
+
34
+
35
@dataclass
class Lyrics:
    """The full lyrics of one song and the source they were fetched from."""

    plain_text: str
    lines: list[LyricLine] = field(default_factory=list)
    # True when line-level timestamps are available.
    synced: bool = False
    source: str = "unknown"

    @property
    def words(self) -> list[str]:
        """Return every word of ``plain_text`` in its original order."""
        tokens = self.plain_text.split()
        return tokens

    @property
    def word_count(self) -> int:
        """Return how many words ``words`` yields."""
        all_words = self.words
        return len(all_words)
51
+
52
+
53
def parse_lrc(lrc_text: str) -> list[LyricLine]:
    """
    Parse LRC format into LyricLine objects.

    Recognised line shapes:
        [MM:SS.cc] text              standard line (1-3 fraction digits)
        [MM:SS] text                 timestamp without a fraction
        [MM:SS.cc][MM:SS.cc] text    compressed form: same text at several times
        [MM:SS.cc] <MM:SS.cc> word   enhanced LRC; word-level tags are stripped

    Handling of everything else:
        - Timestamped lines whose text is empty (instrumental markers) are skipped.
        - Other bracketed lines (metadata like [ar:Artist]) are dropped.
        - Any remaining non-empty line is kept with ``timestamp=None``.

    Args:
        lrc_text: Raw LRC document.

    Returns:
        LyricLine objects in document order. If a compressed line was
        expanded, the timed entries are re-sorted chronologically, while
        untimed entries keep their positions.
    """
    # A leading time tag; minutes may have 1-3 digits, the fraction is optional
    # and may be separated by "." or ":". Trailing blanks are consumed so that
    # consecutive tags separated by spaces are still recognised.
    time_tag = re.compile(r"\[(\d{1,3}):(\d{2})(?:[.:](\d{1,3}))?\]\s*")
    # Enhanced-LRC word-level tag.
    word_tag = re.compile(r"<\d{1,3}:\d{2}(?:[.:]\d{1,3})?>")

    lines: list[LyricLine] = []
    expanded_compressed_line = False

    for raw_line in lrc_text.strip().split("\n"):
        raw_line = raw_line.strip()
        if not raw_line:
            continue

        # Collect every time tag at the start of the line.
        stamps: list[float] = []
        pos = 0
        while True:
            tag = time_tag.match(raw_line, pos)
            if tag is None:
                break
            minutes, seconds, frac = tag.groups()
            # Scale by the digit count: "5" -> 0.5, "50" -> 0.50, "500" -> 0.500.
            fraction = int(frac) / 10 ** len(frac) if frac else 0.0
            stamps.append(int(minutes) * 60 + int(seconds) + fraction)
            pos = tag.end()

        if not stamps:
            # Non-timestamped line: keep plain text, drop bracketed metadata.
            if not raw_line.startswith("["):
                lines.append(LyricLine(text=raw_line))
            continue

        text, removed_tags = word_tag.subn("", raw_line[pos:])
        if removed_tags:
            # Removing word tags leaves doubled blanks between words.
            text = " ".join(text.split())
        else:
            text = text.strip()

        if text:  # Skip empty lines (instrumental markers)
            lines.extend(LyricLine(text=text, timestamp=stamp) for stamp in stamps)
            if len(stamps) > 1:
                expanded_compressed_line = True

    if expanded_compressed_line:
        # Repeated lines were emitted next to each other; restore time order
        # for the timed entries without moving the untimed ones.
        timed = iter(
            sorted(
                (line for line in lines if line.timestamp is not None),
                key=lambda line: line.timestamp,
            )
        )
        lines = [
            next(timed) if line.timestamp is not None else line
            for line in lines
        ]

    return lines
94
+
95
+
96
class LRCLIBFetcher:
    """
    Fetch lyrics from LRCLIB.net — free, no auth, community-maintained.
    Returns both synced LRC and plain text when available.
    """

    BASE_URL = "https://lrclib.net/api"

    def fetch(
        self,
        artist: str,
        title: str,
        album: Optional[str] = None,
        duration: Optional[float] = None,
    ) -> Optional[Lyrics]:
        """
        Fetch lyrics by metadata match.

        Args:
            artist: Artist name
            title: Track title
            album: Album name (optional, improves match accuracy)
            duration: Track duration in seconds (optional)

        Returns:
            Lyrics built from the matched record, or None when there is no
            match, the request fails, or the record holds no usable lyrics.
        """
        params = {
            "artist_name": artist,
            "track_name": title,
        }
        if album:
            params["album_name"] = album
        if duration:
            params["duration"] = int(duration)

        try:
            resp = requests.get(f"{self.BASE_URL}/get", params=params, timeout=10)
            if resp.status_code == 404:
                logger.debug(f"LRCLIB: no match for {artist} - {title}")
                return None
            resp.raise_for_status()
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            logger.warning(f"LRCLIB request failed: {e}")
            return None

        return self._to_lyrics(data)

    def search(self, query: str) -> Optional[Lyrics]:
        """
        Search LRCLIB by text query (fuzzy).

        Results are tried in the order the API returned them; the first one
        that actually carries lyrics wins, so a leading entry without lyrics
        no longer hides a usable later one.

        Args:
            query: Free-text search string.

        Returns:
            Lyrics from the first usable result, or None.
        """
        try:
            resp = requests.get(f"{self.BASE_URL}/search", params={"q": query}, timeout=10)
            if resp.status_code != 200:
                return None
            results = resp.json()
        except (requests.RequestException, ValueError) as e:
            logger.debug(f"LRCLIB search failed: {e}")
            return None

        if not isinstance(results, list):
            return None

        for record in results:
            lyrics = self._to_lyrics(record)
            if lyrics is not None:
                return lyrics

        return None

    @staticmethod
    def _to_lyrics(record: object) -> Optional[Lyrics]:
        """Convert one decoded LRCLIB JSON record into Lyrics, or None if it has no lyrics."""
        if not isinstance(record, dict):
            return None

        synced_lrc = record.get("syncedLyrics")
        # The key may be present with a null value, so normalise to "".
        plain = record.get("plainLyrics") or ""

        if synced_lrc:
            lines = parse_lrc(synced_lrc)
            # A synced payload that parses to nothing must not yield an empty
            # (but truthy) Lyrics object; fall through to the plain text.
            if lines:
                return Lyrics(
                    plain_text=plain or "\n".join(line.text for line in lines),
                    lines=lines,
                    synced=True,
                    source="lrclib",
                )

        if plain.strip():
            lines = [LyricLine(text=line.strip()) for line in plain.split("\n") if line.strip()]
            return Lyrics(plain_text=plain, lines=lines, synced=False, source="lrclib")

        return None
187
+
188
+
189
class SyncedLyricsFetcher:
    """
    Multi-source fetcher using the syncedlyrics library.
    Tries: Lrclib → NetEase → Musixmatch → Megalobiz
    """

    def fetch(self, artist: str, title: str) -> Optional[Lyrics]:
        """
        Fetch synced lyrics using multiple providers.

        Args:
            artist: Artist name
            title: Track title

        Returns:
            Lyrics marked ``synced=True`` when the result contains LRC
            timestamps, plain Lyrics otherwise. None when the library is
            missing, the search fails, or nothing usable came back.
        """
        try:
            import syncedlyrics
        except ImportError:
            logger.warning("syncedlyrics not installed. pip install syncedlyrics")
            return None

        query = f"{artist} {title}"
        providers = ["Lrclib", "NetEase", "Musixmatch", "Megalobiz"]
        try:
            try:
                lrc_text = syncedlyrics.search(
                    query,
                    providers=providers,
                    allow_plain_format=True,
                )
            except TypeError:
                # NOTE(review): newer syncedlyrics releases appear to have
                # replaced `allow_plain_format` with `plain_only`/`synced_only`
                # (plain fallback being the default) — confirm against the
                # installed version. Retrying without the keyword keeps this
                # source working instead of failing on every call.
                lrc_text = syncedlyrics.search(query, providers=providers)
        except Exception as e:
            logger.warning(f"syncedlyrics search failed: {e}")
            return None

        if not lrc_text:
            return None

        # Check if it's LRC format (has timestamps)
        is_lrc = re.search(r"\[\d{2}:\d{2}\.\d{2,3}\]", lrc_text) is not None
        if is_lrc:
            lines = parse_lrc(lrc_text)
            plain_text = "\n".join(line.text for line in lines)
        else:
            lines = [LyricLine(text=raw.strip()) for raw in lrc_text.split("\n") if raw.strip()]
            plain_text = lrc_text

        if not lines:
            # Nothing but tags/whitespace: report "not found" so the caller
            # can try another source instead of accepting empty lyrics.
            return None

        return Lyrics(
            plain_text=plain_text,
            lines=lines,
            synced=is_lrc,
            source="syncedlyrics",
        )
234
+
235
+
236
class GeniusFetcher:
    """
    Fetch plain-text lyrics from Genius.
    Requires API token. No synced/timed lyrics available.
    """

    def __init__(self, token: str):
        """Store the Genius API token used for every request."""
        self.token = token

    def fetch(self, artist: str, title: str) -> "Optional[Lyrics]":
        """
        Fetch lyrics from Genius API.

        Args:
            artist: Artist name
            title: Track title

        Returns:
            Unsynced Lyrics, or None when lyricsgenius is not installed, the
            lookup fails, no song is found, or nothing is left after cleaning.
        """
        try:
            import lyricsgenius
        except ImportError:
            logger.warning("lyricsgenius not installed. pip install lyricsgenius")
            return None

        try:
            genius = lyricsgenius.Genius(self.token, verbose=False)
            genius.remove_section_headers = True
            song = genius.search_song(title, artist)
            if song and song.lyrics:
                # Clean up Genius formatting artifacts
                text = self._clean_genius_lyrics(song.lyrics)
                lines = [LyricLine(text=raw.strip()) for raw in text.split("\n") if raw.strip()]
                # Only artifacts and no real text: treat as "not found".
                if lines:
                    return Lyrics(plain_text=text, lines=lines, synced=False, source="genius")
        except Exception as e:
            logger.warning(f"Genius fetch failed: {e}")

        return None

    @staticmethod
    def _clean_genius_lyrics(raw: str) -> str:
        """
        Remove Genius-specific formatting.

        Strips bracketed section headers, a leading "N Contributors…Lyrics"
        header, the trailing "Embed" marker (with or without a count), the
        "You might also like" insert, and runs of blank lines.

        Args:
            raw: Lyrics text as returned by lyricsgenius.

        Returns:
            Cleaned text with surrounding whitespace removed.
        """
        # Remove section headers like [Chorus], [Verse 1]
        text = re.sub(r"\[.*?\]", "", raw)
        # Remove the contributor header; it is limited to the first line so
        # that real lyric text is never consumed.
        text = re.sub(r"\A\s*\d+\s*Contributors?[^\n]*?Lyrics", "", text, count=1)
        # Remove the "Embed" suffix: the count is optional and trailing
        # whitespace may follow it.
        text = re.sub(r"\d*Embed\s*\Z", "", text)
        text = re.sub(r"You might also like", "", text)
        # Clean up multiple blank lines
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()
278
+
279
+
280
def fetch_lyrics(
    artist: str,
    title: str,
    album: Optional[str] = None,
    duration: Optional[float] = None,
    genius_token: Optional[str] = None,
) -> Optional[Lyrics]:
    """
    Look up lyrics, trying each source in turn until one succeeds.

    Source order:
        1. LRCLIB (free, synced, no auth)
        2. syncedlyrics (multi-source, synced)
        3. Genius (plain text, only tried when a token is supplied)

    Args:
        artist: Artist name
        title: Track title
        album: Album name (optional, passed to LRCLIB only)
        duration: Track duration in seconds (optional, passed to LRCLIB only)
        genius_token: Genius API token (optional, enables the Genius fallback)

    Returns:
        The Lyrics from the first source that produced a result, or None
        (after logging a warning) when every source came up empty.
    """
    # First choice: LRCLIB, the only source that uses album and duration.
    found = LRCLIBFetcher().fetch(artist, title, album, duration)
    if found:
        logger.info(f"Lyrics from LRCLIB (synced={found.synced}): {len(found.words)} words")
        return found

    # Second choice: the syncedlyrics aggregator.
    found = SyncedLyricsFetcher().fetch(artist, title)
    if found:
        logger.info(f"Lyrics from syncedlyrics (synced={found.synced}): {len(found.words)} words")
        return found

    # Last resort: Genius plain text, skipped entirely without a token.
    found = GeniusFetcher(genius_token).fetch(artist, title) if genius_token else None
    if found:
        logger.info(f"Lyrics from Genius (plain): {len(found.words)} words")
        return found

    logger.warning(f"No lyrics found for: {artist} - {title}")
    return None