Spaces:
Sleeping
Sleeping
File size: 2,038 Bytes
fc7b4a9 7633e2f fc7b4a9 7633e2f fc7b4a9 7633e2f fc7b4a9 7633e2f fc7b4a9 7633e2f fc7b4a9 7633e2f fc7b4a9 7633e2f fc7b4a9 7633e2f fc7b4a9 7633e2f fc7b4a9 7633e2f fc7b4a9 7633e2f fc7b4a9 7633e2f fc7b4a9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
import re
class LyricsPreprocessor:
    r"""
    Clean raw song lyrics into a single whitespace-normalized string
    (e.g. as input for LLM2Vec).

    Parameters
    ----------
    keep_case : bool, optional (default=True)
        If False, converts all lyrics to lowercase.
    keep_punctuation : bool, optional (default=True)
        If False, removes every character that is neither a word character
        (letters, digits, underscore) nor whitespace.

    Usage
    -----
    >>> preprocessor = LyricsPreprocessor(keep_case=False, keep_punctuation=False)
    >>> preprocessor("Hello, world!\n[Chorus]\nSing along")
    'hello world sing along'
    """

    # A line consisting solely of a bracketed or parenthesized annotation,
    # e.g. "[Chorus]" or "(Verse 1)".
    # NOTE: the match is greedy, so a line that merely starts with "(" and
    # ends with ")" (e.g. "(oh) yeah (yeah)") is also treated as metadata.
    _METADATA_LINE = re.compile(r"^(?:\[.*\]|\(.*\))$")

    # Any character that is neither a word character nor whitespace.
    _PUNCTUATION = re.compile(r"[^\w\s]")

    def __init__(self, keep_case: bool = True, keep_punctuation: bool = True) -> None:
        self.keep_case = keep_case
        self.keep_punctuation = keep_punctuation

    def __call__(self, lyrics: str) -> str:
        """
        Preprocess the input lyrics text.

        Steps:
        1. Drops empty lines and metadata-only lines (e.g., [Chorus], (Verse)).
        2. Applies case handling and punctuation removal based on settings.
        3. Joins all remaining words with single spaces.

        Parameters
        ----------
        lyrics : str
            Raw lyrics text, with lines separated by newline characters.

        Returns
        -------
        str
            The cleaned lyrics on one line, words separated by exactly one
            space, with no leading or trailing whitespace.
        """
        words = []
        for line in lyrics.split("\n"):
            line = line.strip()

            # Skip blank lines and section markers like [Chorus] or (Verse)
            if not line or self._METADATA_LINE.match(line):
                continue

            if not self.keep_case:
                line = line.lower()

            if not self.keep_punctuation:
                line = self._PUNCTUATION.sub("", line)

            # Collect words rather than concatenating per line: a line that
            # became empty after punctuation removal then contributes nothing,
            # instead of leaving a stray double space in the result.
            words.extend(line.split())

        return " ".join(words)
|