bach-or-bot / src /preprocessing /lyrics_preprocessor.py
Acelle Krislette Rosales
Initial commit: Added application code
fc7b4a9
raw
history blame
3.52 kB
import re
class LyricsPreprocessor:
    r"""
    A preprocessing class for cleaning and preparing song lyrics
    for LLM2Vec.

    Parameters
    ----------
    keep_case : bool, optional (default=True)
        If False, converts all lyrics to lowercase.
    keep_punctuation : bool, optional (default=True)
        If False, removes all punctuation from lyrics.

    Usage
    -----
    >>> preprocessor = LyricsPreprocessor(keep_case=False, keep_punctuation=False)
    >>> processed = preprocessor("Hello, world!\n[Chorus]\nSing along")
    >>> print(processed)
    hello world sing along
    """

    # A line that is entirely a bracketed or parenthesised marker,
    # e.g. "[Chorus]" or "(Verse)".
    _METADATA_LINE = re.compile(r'^\[.*\]$|^\(.*\)$')
    # Any character that is neither a word character nor whitespace.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(self, keep_case=True, keep_punctuation=True):
        self.keep_case = keep_case
        self.keep_punctuation = keep_punctuation

    def _clean_lines(self, lyrics):
        """
        Return the non-empty, non-metadata lines of ``lyrics`` with the
        configured case and punctuation handling applied.

        Lines that become empty once punctuation is removed (for example
        a line consisting only of "...") are dropped, so callers never
        receive empty strings.
        """
        cleaned = []
        for line in lyrics.split('\n'):
            line = line.strip()

            # Skip blank lines and marker lines like [Chorus] or (Verse)
            if not line or self._METADATA_LINE.match(line):
                continue

            # Case handling
            if not self.keep_case:
                line = line.lower()

            # Punctuation handling; re-strip because removing leading or
            # trailing punctuation can expose whitespace at the edges.
            if not self.keep_punctuation:
                line = self._PUNCTUATION.sub('', line).strip()

            if line:
                cleaned.append(line)
        return cleaned

    def __call__(self, lyrics: str) -> str:
        """
        Preprocess the input lyrics text.

        Steps:
        1. Removes empty lines or lines with metadata (e.g., [Chorus], (Verse)).
        2. Applies case handling and punctuation removal based on settings.
        3. Builds a cleaned lyrics string with every run of whitespace
           collapsed to a single space.

        Parameters
        ----------
        lyrics : str
            Raw lyrics text.

        Returns
        -------
        str
            A cleaned lyric string; empty if no lyric lines remain.
        """
        # Flatten all lines into one word stream so that words are always
        # separated by exactly one space, within and across lines.
        return ' '.join(
            word
            for line in self._clean_lines(lyrics)
            for word in line.split()
        )

    def musiclime_lyrics_extractor(self, lyrics: str) -> list:
        """
        Preprocess the input lyrics text and keep it segmented by line.

        Steps:
        1. Removes empty lines or lines with metadata (e.g., [Chorus], (Verse)).
        2. Applies case handling and punctuation removal based on settings.
        3. Builds a list with one entry per remaining lyric line.

        Unlike calling the instance directly, whitespace inside a line is
        left as it is; only the ends of each line are stripped.

        Parameters
        ----------
        lyrics : str
            Raw lyrics text.

        Returns
        -------
        line_segmented_lyrics : list
            List of lines from the lyrics, processed using the class.
        """
        return self._clean_lines(lyrics)